getting somewhere i swear

This commit is contained in:
anon 2024-10-06 21:48:24 +02:00
parent 95c847b9a1
commit 27338a3481
6 changed files with 75 additions and 51 deletions

2
.gitignore vendored
View File

@ -3,3 +3,5 @@ venv/
*.out *.out
__pycache__/ __pycache__/
*.norm *.norm
data/linux/
*.pkl

4
README.md Normal file
View File

@ -0,0 +1,4 @@
# NOTES
+ we have a problem on nuking system includes;
this interferes with trying to be language agnostic;
hopefully the AI can just realize there's never spaces there

View File

@ -1,2 +1,5 @@
LINE_WIDTH = 80 LINE_WIDTH = 80
MAX_SHIMS = LINE_WIDTH - 1 MAX_SHIMS = LINE_WIDTH - 1
SOURCE_LINE_BATCH_SIZE = 3
COMPILE_INPUT_DIRECTORY = "data/linux/"

View File

@ -4,12 +4,6 @@
@STOP @STOP
*/ */
%{ %{
/* NOTE: this shall be compiled as a shared library so python may call in
*/
/* XXX: we have a problem on nuking system includes;
this fucks with trying to be language agnostic;
i wonder if hopefully the AI can just realize theres never spaces there
*/
#include <stdio.h> #include <stdio.h>
#include <stdbool.h> #include <stdbool.h>
@ -23,10 +17,10 @@
int accumulator = 0; int accumulator = 0;
FILE * build_file; FILE * build_file;
int schemantic[MAX_SHIMS]; char schemantic[MAX_SHIMS];
int schim = 0; int schim = 0;
#define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, 1, build_file) #define STEP_SCHEMANTIC fread(schemantic, MAX_SHIMS, sizeof(char), build_file)
#define ECHOS(s) fwrite(s, strlen(s), 1, yyout) #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
#define EOL '\n' #define EOL '\n'
@ -108,7 +102,7 @@ special {comment_marker}|{assignment}|{shift}|{modify}
[ ]|\t { ; } [ ]|\t { ; }
{word}|. { {word}|. {
ECHO; ECHO;
for (int i = 0; i < schemantic[schim]; i++) { for (char i = 0; i < schemantic[schim]; i++) {
ECHOS(" "); ECHOS(" ");
} }
++schim; ++schim;

104
data.py
View File

@ -1,53 +1,71 @@
from glob import glob
import numpy as np import numpy as np
import pickle
from sys import argv
from config import * from config import *
import tard_wrangler import tard_wrangler
def get_data(): def get_source(path : str) -> [str]:
'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
r = [] r = []
INPUT_FILE = "data/xop.c" # read data
def get_source(path : str) -> [str]: with open(path, 'r') as file: lines = [line[:-1] for line in file]
'''returns source file 3 line batches''' # pad with empty lines
r = [] for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
with open(path, 'r') as file: lines = [line[:-1] for line in file]
lines.insert(0, "") lines.insert(0, "")
lines.append("") lines.append("")
for i in range(len(lines)-2): # batch
r.append(lines[i:i+3]) for i in range(len(lines)-2):
return r r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE])
def source_to_np_array(source_batches : []) -> np.array: return r
r = []
for s in source_batches: def source_to_np_array(source_batches : []) -> np.array:
ascii_list = [] '''returns image like array from batches'''
for l in s: r = []
l = l[:LINE_WIDTH] for s in source_batches:
l = l.ljust(LINE_WIDTH) ascii_list = []
l = [ord(i) for i in l] for l in s:
ascii_list += l l = l[:LINE_WIDTH] # cut long lines
n = np.reshape(ascii_list, (3, -1, 1)) l = l.ljust(LINE_WIDTH) # pad short lines
r.append(n) l = [ord(i) for i in l]
r = np.array(r) ascii_list += l
return r n = np.reshape(ascii_list, (3, -1, 1))
def get_whitespace(path : str) -> [int]: r.append(n)
'''XXX returns the whitespace list of every middle line''' r = np.array(r)
r = [] return r
output = "muf_file.txt"
tard_wrangler.accumulate(INPUT_FILE, output) def read_acc(path : str) -> [[int]]:
with open(output, 'r') as file: r = []
for line in file: with open(path, 'r') as file:
try: for line in file:
l = eval(line) try:
l = l + [0] * (MAX_SHIMS - len(l)) l = eval(line)
r.append(l) l = l + [0] * (MAX_SHIMS - len(l))
except: pass r.append(l)
return r except: pass
def whitespace_to_np_array(spaces : []) -> np.array: return r
r = spaces
r = np.array(r).reshape(len(spaces), -1) def whitespace_to_np_array(spaces : []) -> np.array:
return r r = spaces
source = source_to_np_array(get_source(INPUT_FILE)) r = np.array(r).reshape(len(spaces), -1)
whitespace = whitespace_to_np_array(get_whitespace(INPUT_FILE)) return r
r = {'in': source, 'out': whitespace}
def compile_data():
r = {'in': [], 'out': [], 'src': []}
for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
if n > 47: break # XXX
acc_path = path + ".acc"
r['src'].append(path)
r['in'] += get_source(path)
r['out'] += read_acc(acc_path)
r['in'] = source_to_np_array(r['in'])
r['out'] = whitespace_to_np_array(r['out'])
return r
def get_data():
r = []
with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f)
assert len(r['in']) == len(r['out']), ( assert len(r['in']) == len(r['out']), (
"data in and out sizes were inconsistent (" "data in and out sizes were inconsistent ("
+ str(r['in'].shape) + str(r['in'].shape)
@ -58,6 +76,8 @@ def get_data():
return r return r
if __name__ == "__main__": if __name__ == "__main__":
if len(argv) == 2 and argv[1] == 'c': # clean compile
with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f)
dataset = get_data() dataset = get_data()
print(dataset) print(dataset)
print(dataset['in'].shape, dataset['out'].shape) print(dataset['in'].shape, dataset['out'].shape)

View File

@ -24,6 +24,7 @@ model = keras.Sequential([
), ),
layers.Flatten(), layers.Flatten(),
layers.Dense(64, activation='relu'), layers.Dense(64, activation='relu'),
layers.Dense(64, activation='relu'),
layers.Dense(MAX_SHIMS) #activation='softmax' layers.Dense(MAX_SHIMS) #activation='softmax'
]) ])