From 27338a3481169379bae6c975fda4013f26f2f654 Mon Sep 17 00:00:00 2001 From: anon Date: Sun, 6 Oct 2024 21:48:24 +0200 Subject: [PATCH] getting somewhere i swear --- .gitignore | 2 + README.md | 4 ++ config.py | 3 ++ converter.l | 12 ++---- data.py | 104 ++++++++++++++++++++++++++++++--------------------- formatter.py | 1 + 6 files changed, 75 insertions(+), 51 deletions(-) create mode 100644 README.md diff --git a/.gitignore b/.gitignore index 4f5520f..9c6ac94 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ venv/ *.out __pycache__/ *.norm +data/linux/ +*.pkl diff --git a/README.md b/README.md new file mode 100644 index 0000000..569b6dd --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# NOTES ++ we have a problem on nuking system includes; +this fucks with trying to be language agnostic; +i wonder if hopefully the AI can just realize theres never spaces there diff --git a/config.py b/config.py index 816987c..83c6c42 100644 --- a/config.py +++ b/config.py @@ -1,2 +1,5 @@ LINE_WIDTH = 80 MAX_SHIMS = LINE_WIDTH - 1 +SOURCE_LINE_BATCH_SIZE = 3 + +COMPILE_INPUT_DIRECTORY = "data/linux/" diff --git a/converter.l b/converter.l index 607dc09..6adbfa6 100644 --- a/converter.l +++ b/converter.l @@ -4,12 +4,6 @@ @STOP */ %{ - /* NOTE: this shall be compiled as a shared library so python may call in - */ - /* XXX: we have a problem on nuking system includes; - this fucks with trying to be language agnostic; - i wonder if hopefully the AI can just realize theres never spaces there - */ #include #include @@ -23,10 +17,10 @@ int accumulator = 0; FILE * build_file; - int schemantic[MAX_SHIMS]; + char schemantic[MAX_SHIMS]; int schim = 0; - #define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, 1, build_file) + #define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file) #define ECHOS(s) fwrite(s, strlen(s), 1, yyout) #define EOL '\n' @@ -108,7 +102,7 @@ special {comment_marker}|{assignment}|{shift}|{modify} [ ]|\t { ; } {word}|. { ECHO; - for (int i = 0; i < schemantic[schim]; i++) { + for (char i = 0; i < schemantic[schim]; i++) { ECHOS(" "); } ++schim; diff --git a/data.py b/data.py index d330a52..7b76b8e 100644 --- a/data.py +++ b/data.py @@ -1,53 +1,71 @@ +from glob import glob import numpy as np +import pickle +from sys import argv from config import * import tard_wrangler -def get_data(): +def get_source(path : str) -> [str]: + '''returns source file in $SOURCE_LINE_BATCH_SIZE line batches''' r = [] - INPUT_FILE = "data/xop.c" - def get_source(path : str) -> [str]: - '''returns source file 3 line batches''' - r = [] - with open(path, 'r') as file: lines = [line[:-1] for line in file] + # read data + with open(path, 'r') as file: lines = [line[:-1] for line in file] + # pad with empty lines + for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)): lines.insert(0, "") lines.append("") - for i in range(len(lines)-2): - r.append(lines[i:i+3]) - return r - def source_to_np_array(source_batches : []) -> np.array: - r = [] - for s in source_batches: - ascii_list = [] - for l in s: - l = l[:LINE_WIDTH] - l = l.ljust(LINE_WIDTH) - l = [ord(i) for i in l] - ascii_list += l - n = np.reshape(ascii_list, (3, -1, 1)) - r.append(n) - r = np.array(r) - return r - def get_whitespace(path : str) -> [int]: - '''XXX returns the whitespace list of every middle line''' - r = [] - output = "muf_file.txt" - tard_wrangler.accumulate(INPUT_FILE, output) - with open(output, 'r') as file: - for line in file: - try: - l = eval(line) - l = l + [0] * (MAX_SHIMS - len(l)) - r.append(l) - except: pass - return r - def whitespace_to_np_array(spaces : []) -> np.array: - r = spaces - r = np.array(r).reshape(len(spaces), -1) - return r - source = source_to_np_array(get_source(INPUT_FILE)) - whitespace = whitespace_to_np_array(get_whitespace(INPUT_FILE)) - r = {'in': source, 'out': whitespace} + # batch + for i in range(len(lines)-2): + r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE]) + return r + +def source_to_np_array(source_batches : []) -> np.array: + '''returns image like array from batches''' + r = [] + for s in source_batches: + ascii_list = [] + for l in s: + l = l[:LINE_WIDTH] # cut long lines + l = l.ljust(LINE_WIDTH) # pad short lines + l = [ord(i) for i in l] + ascii_list += l + n = np.reshape(ascii_list, (3, -1, 1)) + r.append(n) + r = np.array(r) + return r + +def read_acc(path : str) -> [[int]]: + r = [] + with open(path, 'r') as file: + for line in file: + try: + l = eval(line) + l = l + [0] * (MAX_SHIMS - len(l)) + r.append(l) + except: pass + return r + +def whitespace_to_np_array(spaces : []) -> np.array: + r = spaces + r = np.array(r).reshape(len(spaces), -1) + return r + +def compile_data(): + r = {'in': [], 'out': [], 'src': []} + for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")): + if n > 47: break # XXX + acc_path = path + ".acc" + r['src'].append(path) + r['in'] += get_source(path) + r['out'] += read_acc(acc_path) + r['in'] = source_to_np_array(r['in']) + r['out'] = whitespace_to_np_array(r['out']) + return r + +def get_data(): + r = [] + with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f) assert len(r['in']) == len(r['out']), ( "data in and out sizes were inconsistent (" + str(r['in'].shape) @@ -58,6 +76,8 @@ def get_data(): return r if __name__ == "__main__": + if len(argv) == 2 and argv[1] == 'c': # clean compile + with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f) dataset = get_data() print(dataset) print(dataset['in'].shape, dataset['out'].shape) diff --git a/formatter.py b/formatter.py index b8c2f80..9a31e1c 100644 --- a/formatter.py +++ b/formatter.py @@ -24,6 +24,7 @@ model = keras.Sequential([ ), layers.Flatten(), layers.Dense(64, activation='relu'), + layers.Dense(64, activation='relu'), layers.Dense(MAX_SHIMS) #activation='softmax' ])