diff --git a/.gitignore b/.gitignore
index 1f0f7a5..b0b4332 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv/
 *.yy.*
 *.out
+__pycache__/
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..816987c
--- /dev/null
+++ b/config.py
@@ -0,0 +1,2 @@
+LINE_WIDTH = 80
+MAX_SHIMS = LINE_WIDTH - 1
diff --git a/data.py b/data.py
index d4db29c..68fb6c8 100644
--- a/data.py
+++ b/data.py
@@ -1,43 +1,53 @@
-import re
-from bidict import bidict
+import subprocess
+import numpy as np
 
-#CHAR_TOKENS = bidict({
-#    '': 0,
-#    '\n': 1,
-#})
-#CHAR_TOKEN_OFFSET = 1
+from config import *
 
-def encode(s : str) -> str:
-    return re.sub(r'\s+', ' ', s)
+def get_data():
+    r = []
+    INPUT_FILE = "data/xop.c"
+    def get_source(path : str) -> [str]:
+        '''returns source file 3 line batches'''
+        r = []
+        with open(path, 'r') as file:
+            lines = []
+            for line in file:
+                lines.append(line.strip())
+            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
+        return r
+    def source_to_np_array(source_batches : []) -> np.array:
+        r = []
+        for s in source_batches:
+            ascii_list = []
+            for l in s:
+                l = l[:LINE_WIDTH]
+                l = l.ljust(LINE_WIDTH)
+                l = [ord(i) for i in l]
+                ascii_list += l
+            n = np.reshape(ascii_list, (3, -1, 1))
+            n = np.expand_dims(n, axis=0)
+            r.append(n)
+        return r
+    def get_whitespace(path : str) -> [int]:
+        '''XXX returns the whitespace list of every middle line'''
+        r = []
+        output_file = "muf_file.txt"
+        subprocess.run(
+            "converter.out accumulate " + path + " > " + output_file,
+            shell=True,
+        )
+        with open(output_file, 'r') as file:
+            for n, line in enumerate(file):
+                if ((n + 2) % 3) != 0: continue
+                r.append(eval(line))
+        return r
+    source = source_to_np_array(get_source(INPUT_FILE))
+    whitespace = get_whitespace(INPUT_FILE)
+    whitespace = [np.array(i) for i in whitespace]
+    r = {'in': source, 'out': whitespace}
+    assert len(r['in']) == len(r['out']), "data in and out sizes were inconsistent."
+    return r
 
-#def decode(s : str, o : [int]) -> str:
-#    result = []
-#    space_index = 0
-#    for char in s:
-#        if char == ' ':
-#            if o[space_index] in CHAR_TOKENS.inverse:
-#                result.append(CHAR_TOKENS.inverse[o[space_index]])
-#            else:
-#                result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
-#            space_index += 1
-#        else:
-#            result.append(char)
-#    return ''.join(result)
-
-def decode(s : str, o : [int]) -> str:
-    result = []
-    space_index = 0
-    for char in s:
-        if char == ' ':
-            result.append(' ' * (o[space_index])
-            space_index += 1
-        else:
-            result.append(char)
-    return ''.join(result)
-
-def batchificate(f):
-    BATCH_SIZE = 32
-    s = open(f, 'r').read()
-    s = encode(s)
-
-print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))
+if __name__ == "__main__":
+    dataset = get_data()
+    print(dataset)
diff --git a/formatter.py b/formatter.py
index f6b471e..dfd2b51 100644
--- a/formatter.py
+++ b/formatter.py
@@ -1,62 +1,15 @@
-import subprocess
-import os
 import numpy as np
+import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import tensorflow
 from tensorflow import keras
 from keras import layers
 
-LINE_WIDTH = 80
-MAX_SHIMS = LINE_WIDTH - 1
+from config import *
+import data
 
 
-
-def get_data():
-    r = []
-    def get_source(path : str) -> [str]:
-        '''returns source file 3 line batches'''
-        r = []
-        with open(path, 'r') as file:
-            lines = []
-            for line in file:
-                lines.append(line.strip())
-            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
-        return r
-    def source_to_np_array(source_batches : []) -> np.array:
-        r = []
-        for s in source_batches:
-            ascii_list = []
-            for l in s:
-                l = l[:LINE_WIDTH]
-                l = l.ljust(LINE_WIDTH)
-                l = [ord(i) for i in l]
-                ascii_list += l
-            n = np.reshape(ascii_list, (3, -1, 1))
-            n = np.expand_dims(n, axis=0)
-            r.append(n)
-        return r
-    def get_whitespace(path : str) -> [int]:
-        '''XXX returns the whitespace list of every middle line'''
-        r = []
-        output_file = "muf_file.txt"
-        process = subprocess.Popen(
-            "converter.out accumulate " + path + " > " + output_file,
-            shell=True,
-        )
-        with open(output_file, 'r') as file:
-            for n, line in enumerate(file):
-                if ((n + 2) % 3) != 0: continue
-                r.append(eval(line))
-        return r
-    source = source_to_np_array(get_source("in/xop.c"))
-    whitespace = get_whitespace("in/xop.c")
-    whitespace = [np.array(i) for i in whitespace]
-    r = {'in': source, 'out': whitespace}
-    return r
-
-data = get_data()
-assert len(data['in']) == len(data['in']), "data in and out sizes were inconsistent."
-print(data['in'], data['out'])
+dataset = data.get_data()
 
 model = keras.Sequential([
     layers.Conv2D(
@@ -90,7 +43,7 @@ model.compile(
     metrics=['accuracy']
 )
 
-model.fit(data['in'], data['out'],
+model.fit(dataset['in'], dataset['out'],
     verbose=2,
     batch_size=10,
     epochs=50,