From 27338a3481169379bae6c975fda4013f26f2f654 Mon Sep 17 00:00:00 2001
From: anon <anon@anon.anon>
Date: Sun, 6 Oct 2024 21:48:24 +0200
Subject: [PATCH] getting somewhere i swear

---
 .gitignore   |   2 +
 README.md    |   4 ++
 config.py    |   3 ++
 converter.l  |  12 ++----
 data.py      | 104 ++++++++++++++++++++++++++++++---------------------
 formatter.py |   1 +
 6 files changed, 75 insertions(+), 51 deletions(-)
 create mode 100644 README.md

diff --git a/.gitignore b/.gitignore
index 4f5520f..9c6ac94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ venv/
 *.out
 __pycache__/
 *.norm
+data/linux/
+*.pkl
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..569b6dd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,4 @@
+# NOTES
++ we have a problem with nuking system includes;
+this interferes with trying to be language agnostic;
+I wonder whether the AI can simply learn that there are never spaces there
diff --git a/config.py b/config.py
index 816987c..83c6c42 100644
--- a/config.py
+++ b/config.py
@@ -1,2 +1,5 @@
 LINE_WIDTH = 80
 MAX_SHIMS  = LINE_WIDTH - 1
+SOURCE_LINE_BATCH_SIZE = 3
+
+COMPILE_INPUT_DIRECTORY = "data/linux/"
diff --git a/converter.l b/converter.l
index 607dc09..6adbfa6 100644
--- a/converter.l
+++ b/converter.l
@@ -4,12 +4,6 @@
    @STOP
  */
 %{
-    /* NOTE: this shall be compiled as a shared library so python may call in
-    */
-    /* XXX: we have a problem on nuking system includes;
-             this fucks with trying to be language agnostic;
-             i wonder if hopefully the AI can just realize theres never spaces there
-    */
     #include <stdio.h>
     #include <stdbool.h>
 
@@ -23,10 +17,10 @@
     int accumulator = 0;
 
     FILE * build_file;
-    int schemantic[MAX_SHIMS];
+    char schemantic[MAX_SHIMS];
     int schim = 0;
 
-    #define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, 1, build_file)
+    #define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
     #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
 
     #define EOL '\n'
@@ -108,7 +102,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
 [ ]|\t      { ; }
 {word}|.    {
                 ECHO;
-                for (int i = 0; i < schemantic[schim]; i++) {
+                for (char i = 0; i < schemantic[schim]; i++) {
                     ECHOS(" ");
                 }
                 ++schim;
diff --git a/data.py b/data.py
index d330a52..7b76b8e 100644
--- a/data.py
+++ b/data.py
@@ -1,53 +1,71 @@
+from glob import glob
 import numpy as np
+import pickle
+from sys import argv
 
 from config import *
 import tard_wrangler
 
-def get_data():
+def get_source(path : str) -> [str]:
+	'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
 	r = []
-	INPUT_FILE = "data/xop.c"
-	def get_source(path : str) -> [str]:
-		'''returns source file 3 line batches'''
-		r = []
-		with open(path, 'r') as file: lines = [line[:-1] for line in file]
+	# read data
+	with open(path, 'r') as file: lines = [line[:-1] for line in file]
+	# pad with empty lines
+	for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
 		lines.insert(0, "")
 		lines.append("")
-		for i in range(len(lines)-2):
-			r.append(lines[i:i+3])
-		return r
-	def source_to_np_array(source_batches : []) -> np.array:
-		r = []
-		for s in source_batches:
-			ascii_list = []
-			for l in s:
-				l = l[:LINE_WIDTH]
-				l = l.ljust(LINE_WIDTH)
-				l = [ord(i) for i in l]
-				ascii_list += l
-			n = np.reshape(ascii_list, (3, -1, 1))
-			r.append(n)
-		r = np.array(r)
-		return r
-	def get_whitespace(path : str) -> [int]:
-		'''XXX returns the whitespace list of every middle line'''
-		r = []
-		output = "muf_file.txt"
-		tard_wrangler.accumulate(INPUT_FILE, output)
-		with open(output, 'r') as file:
-			for line in file:
-				try:
-					l = eval(line)
-					l = l + [0] * (MAX_SHIMS - len(l))
-					r.append(l)
-				except: pass
-		return r
-	def whitespace_to_np_array(spaces : []) -> np.array:
-		r = spaces
-		r = np.array(r).reshape(len(spaces), -1)
-		return r
-	source = source_to_np_array(get_source(INPUT_FILE))
-	whitespace = whitespace_to_np_array(get_whitespace(INPUT_FILE))
-	r = {'in': source, 'out': whitespace}
+	# batch
+	for i in range(len(lines)-2):
+		r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE])
+	return r
+
+def source_to_np_array(source_batches : []) -> np.array:
+	'''returns image like array from batches'''
+	r = []
+	for s in source_batches:
+		ascii_list = []
+		for l in s:
+			l = l[:LINE_WIDTH]			# cut long lines
+			l = l.ljust(LINE_WIDTH)		# pad short lines
+			l = [ord(i) for i in l]
+			ascii_list += l
+		n = np.reshape(ascii_list, (3, -1, 1))
+		r.append(n)
+	r = np.array(r)
+	return r
+
+def read_acc(path : str) -> [[int]]:
+	r = []
+	with open(path, 'r') as file:
+		for line in file:
+			try:
+				l = eval(line)
+				l = l + [0] * (MAX_SHIMS - len(l))
+				r.append(l)
+			except: pass
+	return r
+
+def whitespace_to_np_array(spaces : []) -> np.array:
+	r = spaces
+	r = np.array(r).reshape(len(spaces), -1)
+	return r
+
+def compile_data():
+	r = {'in': [], 'out': [], 'src': []}
+	for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
+		if n > 47: break # XXX
+		acc_path = path + ".acc"
+		r['src'].append(path)
+		r['in']  += get_source(path)
+		r['out'] += read_acc(acc_path)
+	r['in']  = source_to_np_array(r['in'])
+	r['out'] = whitespace_to_np_array(r['out'])
+	return r
+
+def get_data():
+	r = []
+	with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f)
 	assert len(r['in']) == len(r['out']), (
 			"data in and out sizes were inconsistent ("
 			+ str(r['in'].shape)
@@ -58,6 +76,8 @@ def get_data():
 	return r
 
 if __name__ == "__main__":
+	if len(argv) == 2 and argv[1] == 'c': # clean compile
+		with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f)
 	dataset = get_data()
 	print(dataset)
 	print(dataset['in'].shape, dataset['out'].shape)
diff --git a/formatter.py b/formatter.py
index b8c2f80..9a31e1c 100644
--- a/formatter.py
+++ b/formatter.py
@@ -24,6 +24,7 @@ model = keras.Sequential([
 	),
 	layers.Flatten(),
 	layers.Dense(64, activation='relu'),
+	layers.Dense(64, activation='relu'),
 	layers.Dense(MAX_SHIMS) #activation='softmax'
 ])