inching closer to the truth

2024-10-09 15:16:53 +02:00
parent 4b6bf0f208
commit f8c8f7ef0c
9 changed files with 82 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,8 @@
 venv/
 *.yy.*
 *.out
+*.bin
 __pycache__/
 *.norm
-data/linux/
+training_set/linux/
 *.pkl
--- a/config.py
+++ b/config.py
@ -2,5 +2,5 @@ LINE_WIDTH = 80
 MAX_SHIMS  = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3

-COMPILE_INPUT_DIRECTORY = "data/linux/"
-MODEL_DIRECTORY = "models/"
+COMPILE_INPUT_DIRECTORY = "training_set/linux/"
+MODEL_DIRECTORY = "trained_models/"
--- a/converter.l
+++ b/converter.l
@ -144,7 +144,12 @@ special {comment_marker}|{assignment}|{shift}|{modify}

 signed main(const int argc, const char * const * const argv) {
    if (argc < 3) {
-        puts("Usage: converter <mode> <file>");
+        puts(
+			"Usage:\n"
+			"$ converter normalize  [<file>|^<string>]\n"
+			"$ converter accumulate [<file>|^<string>]\n"
+			"$ converter build      [<file>|^<string>] <schemantic-file>\n"
+		);
        return 1;
    }

@ -155,17 +160,33 @@ signed main(const int argc, const char * const * const argv) {
        mystate = ACCUMULATE;
    } else
    if (!strcmp(argv[1], "build")) {
+		if (argc < 4) { exit(4); }
        mystate = BUILD;
-        build_file = fopen("build_file", "rb");
+        build_file = fopen(argv[3], "rb");
 		if (!build_file) { exit(1); }
        STEP_SCHEMANTIC;
    } else {
        return 1;
    }

-    yyin = fopen(argv[2], "r");
+	char * input;
+	if (argv[2][0] == '^') {
+		input = (char*)argv[2]+1;
+	} else {
+		FILE * f = fopen(argv[2], "r");
+		if(!f){ exit(3); }
+		fseek(f, 0, SEEK_END);
+		int flen = ftell(f);
+		rewind(f);
+		input = malloc(flen+1);
+		input[flen] = '\00';
+		fread(input, flen, sizeof(char), f);
+		fclose(f);
+	}

+	YY_BUFFER_STATE const b = yy_scan_string(input);
    yylex();
+	yy_delete_buffer(b);

    return 0;
 }
--- a/data.py
+++ b/data.py
@ -7,20 +7,24 @@ from sys import argv
 from config import *
 import tard_wrangler

-MAX_DATA_LIMIT = sys.maxsize
+#MAX_DATA_LIMIT = sys.maxsize
+MAX_DATA_LIMIT = 1000

-def get_source(path : str) -> [str]:
+DATASET_FILE = "training_set/dataset-linux.pkl"
+
+def get_source(path : str, normpath : str) -> [str]:
 	'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
 	r = []
 	# read data
-	with open(path, 'r') as file: lines = [line[:-1] for line in file]
+	with open(path, 'r') as f: lines = [line[:-1] for line in f]
+	with open(normpath, 'r') as f: normlines = [line[:-1] for line in f]
 	# pad with empty lines
 	for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
 		lines.insert(0, "")
-		lines.append("")
+		normlines.append("")
 	# batch
-	for i in range(len(lines)-2):
-		r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE])
+	for i in range(len(lines)-1):
+		r.append([lines[i]] + normlines[i:i+SOURCE_LINE_BATCH_SIZE-1])
 	return r

 def source_to_np_array(source_batches : []) -> np.array:
@ -44,7 +48,8 @@ def read_acc(path : str) -> [[int]]:
 		for line in file:
 			try:
 				l = eval(line)
-				l = l + [0] * (MAX_SHIMS - len(l))
+				if len(l) < MAX_SHIMS: l = l + [0] * (MAX_SHIMS - len(l))
+				else: l = l[:MAX_SHIMS]
 				r.append(l)
 			except: pass
 	return r
@ -54,27 +59,28 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 	r = np.array(r).reshape(len(spaces), -1)
 	return r

-def compile_data():
+def compile_data(from_dir : str) -> {}:
 	r = {'in': [], 'out': [], 'src': []}
-	for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-		if n > MAX_DATA_LIMIT: break # XXX
+	for n, path in enumerate(glob(from_dir + "/*.c")):
+		if n > MAX_DATA_LIMIT: break
 		acc_path  = path + ".acc"
 		norm_path = path + ".norm"
-		r['src'].append(path)
-		source_batches = get_source(norm_path)
+		source_batches = get_source(path, norm_path)
 		accumulation   = read_acc(acc_path)
-		assert len(source_batches) == len(accumulation), (
-			f"Some retard fucked up strings in {path}."
-		)
+		if len(source_batches) != len(accumulation):
+			print(f"WARNING: Some retard fucked up strings in {path}")
+			continue
+		r['src'].append(path)
 		r['in']  += source_batches
 		r['out'] += accumulation
+		print(f"INFO: Read data from ({n}) {path}")
 	r['in']  = source_to_np_array(r['in'])
 	r['out'] = whitespace_to_np_array(r['out'])
 	return r

-def get_data():
-	r = []
-	with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f)
+def get_data(dataset_file : str) -> {}:
+	r = {}
+	with open(dataset_file, 'rb') as f: r = pickle.load(f)
 	assert len(r['in']) == len(r['out']), (
 			"data in and out sizes were inconsistent ("
 			+ str(r['in'].shape)
@ -86,7 +92,6 @@ def get_data():

 if __name__ == "__main__":
 	if len(argv) == 2 and argv[1] == 'c': # clean compile
-		with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f)
-	dataset = get_data()
-	print(dataset)
+		with open(DATASET_FILE, 'wb') as f: pickle.dump(compile_data(COMPILE_INPUT_DIRECTORY), f)
+	dataset = get_data(DATASET_FILE)
 	print(dataset['in'].shape, dataset['out'].shape)
--- a/formatter.py
+++ b/formatter.py
@ -10,11 +10,9 @@ import tard_wrangler
 if len(argv) > 1:
 	mymodel = model.load_model(argv[1])
 else:
-	dataset = data.get_data()
+	dataset = data.get_data("dataset-linux.pkl")
 	mymodel = model.make_model(dataset)
 	timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
 	mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")

-predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
-tard_wrangler.build("data/xop.c.norm", predictions)
-tard_wrangler.cat_build()
+print(tard_wrangler.full_predict("training_set/xop.c", "training_set/xop.c.norm", mymodel))
--- a/tard_wrangler.py
+++ b/tard_wrangler.py
@ -1,29 +1,35 @@
 import subprocess
+import shlex
 import numpy as np

 from config import *
 import data

-def accumulate(path : str, output : str) -> None:
-	process = subprocess.Popen(
-				"converter.out accumulate " + path + " > " + output,
-				shell=True,
-	)
+BUILD_FILE = "build_file.bin"

-def full_predict(path : str, model) -> []:
-	r = []
-	myinput = data.source_to_np_array(data.get_source(path))
-	for i in myinput:
-		r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
+def build(what : str, predictions : []) -> None:
+	print(predictions)
+	predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
+	with open(BUILD_FILE, "wb") as f: f.write(predictions)
+	shell_what = shlex.quote(what)
+	shell_what = shell_what[0] + '^' + shell_what[1:]
+	process = subprocess.Popen(
+				"converter.out build " + shell_what + " " + BUILD_FILE,
+				shell=True,
+				stdout=subprocess.PIPE,
+	)
+	r, _ = process.communicate()
+	r = r.decode('utf-8')
 	return r

-def build(path : str, predictions : []) -> None:
-	predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
-	with open("build_file", "wb") as f: f.write(predictions)
-	process = subprocess.Popen(
-				"converter.out build " + path + " > out.c",
-				shell=True,
-	)
-
-def cat_build():
-	with open("out.c") as f: print(f.read())
+def full_predict(path : str, normpath : str, model) -> [str]:
+	r = ["\n"]
+	batches = data.get_source(path, normpath)
+	for b in batches:
+		b[0] = r[-1]
+		myinput = data.source_to_np_array([b])
+		prediction = model.predict(myinput).astype(np.uint8).tobytes()
+		predicted_string = build(b[1], prediction)
+		r += predicted_string + "\n"
+	r = ''.join(r)
+	return r
--- a/trained_models/.gitkeep
+++ b/trained_models/.gitkeep
--- a/training_set/assignments.list
+++ b/training_set/assignments.list
--- a/training_set/xop.c
+++ b/training_set/xop.c