cleaned up
.gitignore (vendored): 1 change

@@ -1,3 +1,4 @@
 venv/
 *.yy.*
 *.out
+__pycache__/
data.py: 90 changes

@@ -1,43 +1,53 @@
-import re
-from bidict import bidict
+import subprocess
+import numpy as np
 
-#CHAR_TOKENS = bidict({
-# '': 0,
-# '\n': 1,
-#})
-#CHAR_TOKEN_OFFSET = 1
+from config import *
 
-def encode(s : str) -> str:
-    return re.sub(r'\s+', ' ', s)
-
-#def decode(s : str, o : [int]) -> str:
-# result = []
-# space_index = 0
-# for char in s:
-# if char == ' ':
-# if o[space_index] in CHAR_TOKENS.inverse:
-# result.append(CHAR_TOKENS.inverse[o[space_index]])
-# else:
-# result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
-# space_index += 1
-# else:
-# result.append(char)
-# return ''.join(result)
-
-def decode(s : str, o : [int]) -> str:
-    result = []
-    space_index = 0
-    for char in s:
-        if char == ' ':
-            result.append(' ' * (o[space_index])
-            space_index += 1
-        else:
-            result.append(char)
-    return ''.join(result)
-
-def batchificate(f):
-    BATCH_SIZE = 32
-    s = open(f, 'r').read()
-    s = encode(s)
-
-print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))
+def get_data():
+    r = []
+    INPUT_FILE = "data/xop.c"
+    def get_source(path : str) -> [str]:
+        '''returns source file 3 line batches'''
+        r = []
+        with open(path, 'r') as file:
+            lines = []
+            for line in file:
+                lines.append(line.strip())
+            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
+        return r
+    def source_to_np_array(source_batches : []) -> np.array:
+        r = []
+        for s in source_batches:
+            ascii_list = []
+            for l in s:
+                l = l[:LINE_WIDTH]
+                l = l.ljust(LINE_WIDTH)
+                l = [ord(i) for i in l]
+                ascii_list += l
+            n = np.reshape(ascii_list, (3, -1, 1))
+            n = np.expand_dims(n, axis=0)
+            r.append(n)
+        return r
+    def get_whitespace(path : str) -> [int]:
+        '''XXX returns the whitespace list of every middle line'''
+        r = []
+        output_file = "muf_file.txt"
+        process = subprocess.Popen(
+            "converter.out accumulate " + path + " > " + output_file,
+            shell=True,
+        )
+        with open(output_file, 'r') as file:
+            for n, line in enumerate(file):
+                if ((n + 2) % 3) != 0: continue
+                r.append(eval(line))
+        return r
+    source = source_to_np_array(get_source(INPUT_FILE))
+    whitespace = get_whitespace(INPUT_FILE)
+    whitespace = [np.array(i) for i in whitespace]
+    r = {'in': source, 'out': whitespace}
+    assert len(r['in']) == len(r['in']), "data in and out sizes were inconsistent."
+    return r
+
+if __name__ == "__main__":
+    dataset = get_data()
+    print(dataset)
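config.py itself is not included in this commit, but the new "from config import *" in data.py (and in formatter.py below) evidently replaces the LINE_WIDTH and MAX_SHIMS constants that the commit deletes from formatter.py; LINE_WIDTH is the only name data.py actually uses. A minimal config.py consistent with that assumption:

    # config.py: assumed contents, inferred from the constants removed from formatter.py
    LINE_WIDTH = 80              # fixed width each source line is clipped and padded to
    MAX_SHIMS = LINE_WIDTH - 1   # assumed to have moved here as well; not used by data.py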
formatter.py: 57 changes

@@ -1,62 +1,15 @@
-import subprocess
-import os
 import numpy as np
+import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 import tensorflow
 from tensorflow import keras
 from keras import layers
 
-LINE_WIDTH = 80
-MAX_SHIMS = LINE_WIDTH - 1
+from config import *
+import data
 
-def get_data():
-    r = []
-
-    def get_source(path : str) -> [str]:
-        '''returns source file 3 line batches'''
-        r = []
-        with open(path, 'r') as file:
-            lines = []
-            for line in file:
-                lines.append(line.strip())
-            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
-        return r
-    def source_to_np_array(source_batches : []) -> np.array:
-        r = []
-        for s in source_batches:
-            ascii_list = []
-            for l in s:
-                l = l[:LINE_WIDTH]
-                l = l.ljust(LINE_WIDTH)
-                l = [ord(i) for i in l]
-                ascii_list += l
-            n = np.reshape(ascii_list, (3, -1, 1))
-            n = np.expand_dims(n, axis=0)
-            r.append(n)
-        return r
-    def get_whitespace(path : str) -> [int]:
-        '''XXX returns the whitespace list of every middle line'''
-        r = []
-        output_file = "muf_file.txt"
-        process = subprocess.Popen(
-            "converter.out accumulate " + path + " > " + output_file,
-            shell=True,
-        )
-        with open(output_file, 'r') as file:
-            for n, line in enumerate(file):
-                if ((n + 2) % 3) != 0: continue
-                r.append(eval(line))
-        return r
-    source = source_to_np_array(get_source("in/xop.c"))
-    whitespace = get_whitespace("in/xop.c")
-    whitespace = [np.array(i) for i in whitespace]
-    r = {'in': source, 'out': whitespace}
-    return r
+dataset = data.get_data()
 
-data = get_data()
-assert len(data['in']) == len(data['in']), "data in and out sizes were inconsistent."
-print(data['in'], data['out'])
-
 model = keras.Sequential([
     layers.Conv2D(
@@ -90,7 +43,7 @@ model.compile(
     metrics=['accuracy']
 )
 
-model.fit(data['in'], data['out'],
+model.fit(dataset['in'], dataset['out'],
     verbose=2,
     batch_size=10,
     epochs=50,
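A note on the relocated get_whitespace()/get_data() code, which this commit moves but does not change: subprocess.Popen returns immediately without waiting for converter.out, so muf_file.txt can be opened before the converter has finished writing it, and the assert compares len(r['in']) to itself, so it can never fail. A hedged sketch of the presumably intended behaviour (not part of the commit; run_converter is a hypothetical helper name):

    import subprocess

    def run_converter(path: str, output_file: str) -> None:
        # subprocess.run blocks until converter.out exits, so output_file is
        # complete before the caller opens it; check=True surfaces failures.
        subprocess.run(
            "converter.out accumulate " + path + " > " + output_file,
            shell=True,
            check=True,
        )

    # Presumed intent of the size check: compare inputs against outputs.
    # assert len(r['in']) == len(r['out']), "data in and out sizes were inconsistent."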