cleaned up

This commit is contained in:
anon 2024-10-02 20:00:27 +02:00
parent ee64b3aa4a
commit b696ef3dd0
4 changed files with 58 additions and 92 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
venv/
*.yy.*
*.out
__pycache__/

2
config.py Normal file
View File

@ -0,0 +1,2 @@
LINE_WIDTH = 80
MAX_SHIMS = LINE_WIDTH - 1

90
data.py
View File

@ -1,43 +1,53 @@
import re
from bidict import bidict
import subprocess
import numpy as np
#CHAR_TOKENS = bidict({
# '': 0,
# '\n': 1,
#})
#CHAR_TOKEN_OFFSET = 1
from config import *
def encode(s : str) -> str:
    """Collapse every run of whitespace in *s* into a single space.

    Leading/trailing whitespace is preserved (as one space), unlike
    ``' '.join(s.split())``.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', s)
def get_data():
    """Build the training dataset from INPUT_FILE.

    Returns a dict:
      'in'  -- list of numpy arrays of shape (1, 3, LINE_WIDTH, 1): the
               ASCII codes of each 3-line source batch,
      'out' -- list of numpy arrays: the whitespace run-lengths of each
               batch's middle line, as emitted by converter.out.

    Raises AssertionError if the two lists end up with different lengths.
    """
    INPUT_FILE = "data/xop.c"

    def get_source(path : str) -> list:
        '''returns source file 3 line batches'''
        with open(path, 'r') as file:
            lines = [line.strip() for line in file]
        return [lines[i:i + 3] for i in range(0, len(lines), 3)]

    def source_to_np_array(source_batches) -> list:
        # Pad/truncate every line to LINE_WIDTH columns, convert to ASCII
        # codes, and shape each 3-line batch as (1, 3, LINE_WIDTH, 1).
        r = []
        for s in source_batches:
            ascii_list = []
            for l in s:
                l = l[:LINE_WIDTH]
                l = l.ljust(LINE_WIDTH)
                ascii_list += [ord(i) for i in l]
            n = np.reshape(ascii_list, (3, -1, 1))
            n = np.expand_dims(n, axis=0)
            r.append(n)
        return r

    def get_whitespace(path : str) -> list:
        '''XXX returns the whitespace list of every middle line'''
        r = []
        output_file = "muf_file.txt"
        # NOTE(review): shell=True with string concatenation is command
        # injection if `path` is ever untrusted; prefer
        # subprocess.run([...], stdout=handle) with an argument list.
        process = subprocess.Popen(
            "converter.out accumulate " + path + " > " + output_file,
            shell=True,
        )
        # BUG FIX: the original never waited on the child process, so the
        # output file could be opened before converter.out finished (or
        # even created) it.
        process.wait()
        with open(output_file, 'r') as file:
            for n, line in enumerate(file):
                # keep only every middle line (line indices 1, 4, 7, ...)
                if ((n + 2) % 3) != 0:
                    continue
                # NOTE(review): eval() on file content is unsafe in general;
                # the file is produced locally by converter.out, but
                # ast.literal_eval would be safer if each line is a plain
                # Python literal -- confirm the format before switching.
                r.append(eval(line))
        return r

    source = source_to_np_array(get_source(INPUT_FILE))
    whitespace = [np.array(i) for i in get_whitespace(INPUT_FILE)]
    r = {'in': source, 'out': whitespace}
    # BUG FIX: the original asserted len(r['in']) == len(r['in']),
    # which is always true; the intent was to compare 'in' against 'out'.
    assert len(r['in']) == len(r['out']), "data in and out sizes were inconsistent."
    return r
#def decode(s : str, o : [int]) -> str:
# result = []
# space_index = 0
# for char in s:
# if char == ' ':
# if o[space_index] in CHAR_TOKENS.inverse:
# result.append(CHAR_TOKENS.inverse[o[space_index]])
# else:
# result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
# space_index += 1
# else:
# result.append(char)
# return ''.join(result)
def decode(s : str, o : list) -> str:
    """Re-expand whitespace in *s* using the run-length list *o*.

    Every single space in *s* is replaced by ``o[i]`` spaces, consuming
    one entry of *o* per space, left to right (the inverse of encode()
    given the original run lengths).  Non-space characters pass through
    unchanged.

    Raises IndexError if *s* contains more spaces than *o* has entries.
    """
    result = []
    space_index = 0
    for char in s:
        if char == ' ':
            # BUG FIX: the original line was missing a closing parenthesis
            # (`result.append(' ' * (o[space_index])`), a syntax error.
            result.append(' ' * o[space_index])
            space_index += 1
        else:
            result.append(char)
    return ''.join(result)
def batchificate(f):
    """Read the file at path *f* and whitespace-normalize it via encode().

    TODO(review): visibly unfinished -- BATCH_SIZE is defined but unused
    and the encoded text is discarded; presumably 32-sample batches were
    the next step.  Behavior (returning None) is preserved here.
    """
    BATCH_SIZE = 32  # intended batch length; not used yet
    # BUG FIX: the original used open(f, 'r').read() without ever closing
    # the handle; a context manager guarantees the file is closed.
    with open(f, 'r') as handle:
        s = handle.read()
    s = encode(s)
# NOTE(review): debug round-trip of encode()/decode() that runs at import
# time, not only when executed as a script -- probably belongs under the
# __main__ guard below.
print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))

# Script entry point: build and print the dataset.
if __name__ == "__main__":
    dataset = get_data()
    print(dataset)

View File

@ -1,62 +1,15 @@
import subprocess
import os
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow
from tensorflow import keras
from keras import layers
LINE_WIDTH = 80
MAX_SHIMS = LINE_WIDTH - 1
from config import *
import data
def get_data():
r = []
def get_source(path : str) -> [str]:
'''returns source file 3 line batches'''
r = []
with open(path, 'r') as file:
lines = []
for line in file:
lines.append(line.strip())
r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
return r
def source_to_np_array(source_batches : []) -> np.array:
r = []
for s in source_batches:
ascii_list = []
for l in s:
l = l[:LINE_WIDTH]
l = l.ljust(LINE_WIDTH)
l = [ord(i) for i in l]
ascii_list += l
n = np.reshape(ascii_list, (3, -1, 1))
n = np.expand_dims(n, axis=0)
r.append(n)
return r
def get_whitespace(path : str) -> [int]:
'''XXX returns the whitespace list of every middle line'''
r = []
output_file = "muf_file.txt"
process = subprocess.Popen(
"converter.out accumulate " + path + " > " + output_file,
shell=True,
)
with open(output_file, 'r') as file:
for n, line in enumerate(file):
if ((n + 2) % 3) != 0: continue
r.append(eval(line))
return r
source = source_to_np_array(get_source("in/xop.c"))
whitespace = get_whitespace("in/xop.c")
whitespace = [np.array(i) for i in whitespace]
r = {'in': source, 'out': whitespace}
return r
data = get_data()
assert len(data['in']) == len(data['in']), "data in and out sizes were inconsistent."
print(data['in'], data['out'])
dataset = data.get_data()
model = keras.Sequential([
layers.Conv2D(
@ -90,7 +43,7 @@ model.compile(
metrics=['accuracy']
)
model.fit(data['in'], data['out'],
model.fit(dataset['in'], dataset['out'],
verbose=2,
batch_size=10,
epochs=50,