diff --git a/.gitignore b/.gitignore
index 9c6ac94..d39c15c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
 venv/
 *.yy.*
 *.out
+*.bin
 __pycache__/
 *.norm
-data/linux/
+training_set/linux/
 *.pkl
diff --git a/config.py b/config.py
index 777bc2d..21ba0b1 100644
--- a/config.py
+++ b/config.py
@@ -2,5 +2,5 @@
 LINE_WIDTH = 80
 MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
-COMPILE_INPUT_DIRECTORY = "data/linux/"
-MODEL_DIRECTORY = "models/"
+COMPILE_INPUT_DIRECTORY = "training_set/linux/"
+MODEL_DIRECTORY = "trained_models/"
diff --git a/converter.l b/converter.l
index b06bae6..d5917de 100644
--- a/converter.l
+++ b/converter.l
@@ -144,7 +144,12 @@ special {comment_marker}|{assignment}|{shift}|{modify}
 
 signed main(const int argc, const char * const * const argv) {
     if (argc < 3) {
-        puts("Usage: converter ");
+        puts(
+            "Usage:\n"
+            "$ converter normalize [|^]\n"
+            "$ converter accumulate [|^]\n"
+            "$ converter build [|^] \n"
+        );
         return 1;
     }
 
@@ -155,17 +160,33 @@ signed main(const int argc, const char * const * const argv) {
         mystate = ACCUMULATE;
     }
     else if (!strcmp(argv[1], "build")) {
+        if (argc < 4) { exit(4); }
         mystate = BUILD;
-        build_file = fopen("build_file", "rb");
+        build_file = fopen(argv[3], "rb");
         if (!build_file) { exit(1); }
         STEP_SCHEMANTIC;
     }
     else { return 1; }
 
-    yyin = fopen(argv[2], "r");
+    char * input;
+    if (argv[2][0] == '^') {
+        input = (char*)argv[2]+1;
+    } else {
+        FILE * f = fopen(argv[2], "r");
+        if(!f){ exit(3); }
+        fseek(f, 0, SEEK_END);
+        int flen = ftell(f);
+        rewind(f);
+        input = malloc(flen+1);
+        input[flen] = '\00';
+        fread(input, flen, sizeof(char), f);
+        fclose(f);
+    }
+    YY_BUFFER_STATE const b = yy_scan_string(input);
 
     yylex();
+    yy_delete_buffer(b);
 
     return 0;
 }
 
diff --git a/data.py b/data.py
index eb475c4..c53303f 100644
--- a/data.py
+++ b/data.py
@@ -7,20 +7,24 @@ from sys import argv
 from config import *
 import tard_wrangler
 
-MAX_DATA_LIMIT = sys.maxsize
+#MAX_DATA_LIMIT = sys.maxsize
+MAX_DATA_LIMIT = 1000
 
-def get_source(path : str) -> [str]:
+DATASET_FILE = "training_set/dataset-linux.pkl"
+
+def get_source(path : str, normpath : str) -> [str]:
     '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
     r = []
     # read data
-    with open(path, 'r') as file: lines = [line[:-1] for line in file]
+    with open(path, 'r') as f: lines = [line[:-1] for line in f]
+    with open(normpath, 'r') as f: normlines = [line[:-1] for line in f]
     # pad with empty lines
     for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
         lines.insert(0, "")
-        lines.append("")
+        normlines.append("")
     # batch
-    for i in range(len(lines)-2):
-        r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE])
+    for i in range(len(lines)-1):
+        r.append([lines[i]] + normlines[i:i+SOURCE_LINE_BATCH_SIZE-1])
     return r
 
 def source_to_np_array(source_batches : []) -> np.array:
@@ -44,7 +48,8 @@ def read_acc(path : str) -> [[int]]:
     for line in file:
         try:
             l = eval(line)
-            l = l + [0] * (MAX_SHIMS - len(l))
+            if len(l) < MAX_SHIMS: l = l + [0] * (MAX_SHIMS - len(l))
+            else: l = l[:MAX_SHIMS]
             r.append(l)
         except: pass
     return r
@@ -54,27 +59,28 @@ def whitespace_to_np_array(spaces : []) -> np.array:
     r = np.array(r).reshape(len(spaces), -1)
     return r
 
-def compile_data():
+def compile_data(from_dir : str) -> {}:
     r = {'in': [], 'out': [], 'src': []}
-    for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-        if n > MAX_DATA_LIMIT: break # XXX
+    for n, path in enumerate(glob(from_dir + "/*.c")):
+        if n > MAX_DATA_LIMIT: break
         acc_path = path + ".acc"
         norm_path = path + ".norm"
-        r['src'].append(path)
-        source_batches = get_source(norm_path)
+        source_batches = get_source(path, norm_path)
         accumulation = read_acc(acc_path)
-        assert len(source_batches) == len(accumulation), (
-            f"Some retard fucked up strings in {path}."
-        )
+        if len(source_batches) != len(accumulation):
+            print(f"WARNING: Some retard fucked up strings in {path}")
+            continue
+        r['src'].append(path)
         r['in'] += source_batches
         r['out'] += accumulation
+        print(f"INFO: Read data from ({n}) {path}")
     r['in'] = source_to_np_array(r['in'])
     r['out'] = whitespace_to_np_array(r['out'])
     return r
 
-def get_data():
-    r = []
-    with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f)
+def get_data(dataset_file : str) -> {}:
+    r = {}
+    with open(dataset_file, 'rb') as f: r = pickle.load(f)
     assert len(r['in']) == len(r['out']), (
         "data in and out sizes were inconsistent ("
         + str(r['in'].shape)
@@ -86,7 +92,6 @@ if __name__ == "__main__":
     if len(argv) == 2 and argv[1] == 'c': # clean compile
-        with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f)
-    dataset = get_data()
-    print(dataset)
+        with open(DATASET_FILE, 'wb') as f: pickle.dump(compile_data(COMPILE_INPUT_DIRECTORY), f)
+    dataset = get_data(DATASET_FILE)
     print(dataset['in'].shape, dataset['out'].shape)
 
diff --git a/formatter.py b/formatter.py
index 41d458d..61083b8 100644
--- a/formatter.py
+++ b/formatter.py
@@ -10,11 +10,9 @@ import tard_wrangler
 if len(argv) > 1:
     mymodel = model.load_model(argv[1])
 else:
-    dataset = data.get_data()
+    dataset = data.get_data("dataset-linux.pkl")
     mymodel = model.make_model(dataset)
     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
     mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
 
-predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
-tard_wrangler.build("data/xop.c.norm", predictions)
-tard_wrangler.cat_build()
+print(tard_wrangler.full_predict("training_set/xop.c", "training_set/xop.c.norm", mymodel))
diff --git a/tard_wrangler.py b/tard_wrangler.py
index c00d6f5..9c89e3c 100644
--- a/tard_wrangler.py
+++ b/tard_wrangler.py
@@ -1,29 +1,35 @@
 import subprocess
+import shlex
 import numpy as np
 
 from config import *
 import data
 
-def accumulate(path : str, output : str) -> None:
-    process = subprocess.Popen(
-        "converter.out accumulate " + path + " > " + output,
-        shell=True,
-    )
+BUILD_FILE = "build_file.bin"
 
-def full_predict(path : str, model) -> []:
-    r = []
-    myinput = data.source_to_np_array(data.get_source(path))
-    for i in myinput:
-        r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
+def build(what : str, predictions : []) -> None:
+    print(predictions)
+    predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
+    with open(BUILD_FILE, "wb") as f: f.write(predictions)
+    shell_what = shlex.quote(what)
+    shell_what = shell_what[0] + '^' + shell_what[1:]
+    process = subprocess.Popen(
+        "converter.out build " + shell_what + " " + BUILD_FILE,
+        shell=True,
+        stdout=subprocess.PIPE,
+    )
+    r, _ = process.communicate()
+    r = r.decode('utf-8')
     return r
 
-def build(path : str, predictions : []) -> None:
-    predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
-    with open("build_file", "wb") as f: f.write(predictions)
-    process = subprocess.Popen(
-        "converter.out build " + path + " > out.c",
-        shell=True,
-    )
-
-def cat_build():
-    with open("out.c") as f: print(f.read())
+def full_predict(path : str, normpath : str, model) -> [str]:
+    r = ["\n"]
+    batches = data.get_source(path, normpath)
+    for b in batches:
+        b[0] = r[-1]
+        myinput = data.source_to_np_array([b])
+        prediction = model.predict(myinput).astype(np.uint8).tobytes()
+        predicted_string = build(b[1], prediction)
+        r += predicted_string + "\n"
+    r = ''.join(r)
+    return r
diff --git a/models/.gitkeep b/trained_models/.gitkeep
similarity index 100%
rename from models/.gitkeep
rename to trained_models/.gitkeep
diff --git a/data/assignments.list b/training_set/assignments.list
similarity index 100%
rename from data/assignments.list
rename to training_set/assignments.list
diff --git a/data/xop.c b/training_set/xop.c
similarity index 100%
rename from data/xop.c
rename to training_set/xop.c