inching closer to the truth
.gitignore (vendored): 3 changes

@@ -1,7 +1,8 @@
 venv/
 *.yy.*
 *.out
 *.bin
 __pycache__/
 *.norm
-data/linux/
+training_set/linux/
+*.pkl
config.py

@@ -2,5 +2,5 @@ LINE_WIDTH = 80
 MAX_SHIMS = LINE_WIDTH - 1
 SOURCE_LINE_BATCH_SIZE = 3
 
-COMPILE_INPUT_DIRECTORY = "data/linux/"
-MODEL_DIRECTORY = "models/"
+COMPILE_INPUT_DIRECTORY = "training_set/linux/"
+MODEL_DIRECTORY = "trained_models/"
converter.l: 27 changes

@@ -144,7 +144,12 @@ special {comment_marker}|{assignment}|{shift}|{modify}
 
 signed main(const int argc, const char * const * const argv) {
 	if (argc < 3) {
-		puts("Usage: converter <mode> <file>");
+		puts(
+			"Usage:\n"
+			"$ converter normalize [<file>|^<string>]\n"
+			"$ converter accumulate [<file>|^<string>]\n"
+			"$ converter build [<file>|^<string>] <schemantic-file>\n"
+		);
 		return 1;
 	}
@@ -155,17 +160,33 @@ signed main(const int argc, const char * const * const argv) {
 		mystate = ACCUMULATE;
 	} else
 	if (!strcmp(argv[1], "build")) {
+		if (argc < 4) { exit(4); }
 		mystate = BUILD;
-		build_file = fopen("build_file", "rb");
+		build_file = fopen(argv[3], "rb");
 		if (!build_file) { exit(1); }
 		STEP_SCHEMANTIC;
 	} else {
 		return 1;
 	}
 
-	yyin = fopen(argv[2], "r");
+	char * input;
+	if (argv[2][0] == '^') {
+		input = (char*)argv[2]+1;
+	} else {
+		FILE * f = fopen(argv[2], "r");
+		if(!f){ exit(3); }
+		fseek(f, 0, SEEK_END);
+		int flen = ftell(f);
+		rewind(f);
+		input = malloc(flen+1);
+		input[flen] = '\00';
+		fread(input, flen, sizeof(char), f);
+		fclose(f);
+	}
 
+	YY_BUFFER_STATE const b = yy_scan_string(input);
 	yylex();
+	yy_delete_buffer(b);
 
 	return 0;
 }
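Note: the converter now takes inline input as well as files. When the second argument starts with '^', the rest of the argument is scanned directly instead of being opened as a file, and build mode requires the schemantic file as a fourth argument (exit(4) when it is missing). A minimal sketch of the resulting CLI driven from Python; the demo string and file names here are invented:

    import subprocess

    # file input, as before
    subprocess.run(["./converter.out", "normalize", "demo.c"], check=True)

    # inline input: a leading '^' marks the argument itself as the source text
    subprocess.run(["./converter.out", "accumulate", "^int main(){return 0;}"], check=True)

    # build mode now needs the schemantic file as its fourth argument
    result = subprocess.run(
        ["./converter.out", "build", "^int main(){return 0;}", "build_file.bin"],
        capture_output=True, text=True, check=True,
    )
    print(result.stdout)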
data.py: 47 changes

@@ -7,20 +7,24 @@ from sys import argv
 from config import *
 import tard_wrangler
 
-MAX_DATA_LIMIT = sys.maxsize
+#MAX_DATA_LIMIT = sys.maxsize
+MAX_DATA_LIMIT = 1000
 
-def get_source(path : str) -> [str]:
+DATASET_FILE = "training_set/dataset-linux.pkl"
+
+def get_source(path : str, normpath : str) -> [str]:
 	'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
 	r = []
 	# read data
-	with open(path, 'r') as file: lines = [line[:-1] for line in file]
+	with open(path, 'r') as f: lines = [line[:-1] for line in f]
+	with open(normpath, 'r') as f: normlines = [line[:-1] for line in f]
 	# pad with empty lines
 	for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
 		lines.insert(0, "")
 		lines.append("")
+		normlines.append("")
 	# batch
-	for i in range(len(lines)-2):
-		r.append(lines[i:i+SOURCE_LINE_BATCH_SIZE])
+	for i in range(len(lines)-1):
+		r.append([lines[i]] + normlines[i:i+SOURCE_LINE_BATCH_SIZE-1])
 	return r
 
 def source_to_np_array(source_batches : []) -> np.array:
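Note: get_source now windows over both files at once. With SOURCE_LINE_BATCH_SIZE = 3, the raw lines get one leading and one trailing pad while the normalized lines only get a trailing pad, so batch i comes out as [lines[i], normlines[i], normlines[i+1]]: the previous raw line plus the current and next normalized lines. A toy walk-through with invented sample lines:

    SOURCE_LINE_BATCH_SIZE = 3

    lines     = ["", "int main(){", "\treturn 0;", "}", ""]  # raw, padded both ends
    normlines = ["int main(){", "return 0;", "}", ""]        # normalized, padded at the end

    batches = [[lines[i]] + normlines[i:i + SOURCE_LINE_BATCH_SIZE - 1]
               for i in range(len(lines) - 1)]
    # batches[0] == ["", "int main(){", "return 0;"]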
@@ -44,7 +48,8 @@ def read_acc(path : str) -> [[int]]:
 		for line in file:
 			try:
 				l = eval(line)
-				l = l + [0] * (MAX_SHIMS - len(l))
+				if len(l) < MAX_SHIMS: l = l + [0] * (MAX_SHIMS - len(l))
+				else: l = l[:MAX_SHIMS]
 				r.append(l)
 			except: pass
 	return r
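Note: the old read_acc only padded; a row longer than MAX_SHIMS kept its length, leaving the list ragged and presumably breaking the reshape in whitespace_to_np_array. The new branch clamps both ways. A standalone equivalent of that clamp, assuming the config values shown above:

    MAX_SHIMS = 79  # LINE_WIDTH - 1 with LINE_WIDTH = 80, per config.py

    def clamp_row(l):
        '''pad with zeros or truncate so every row is exactly MAX_SHIMS long'''
        return (l + [0] * (MAX_SHIMS - len(l)))[:MAX_SHIMS]

    assert len(clamp_row([1, 2, 3])) == MAX_SHIMS
    assert len(clamp_row(list(range(200)))) == MAX_SHIMS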
@@ -54,27 +59,28 @@ def whitespace_to_np_array(spaces : []) -> np.array:
 	r = np.array(r).reshape(len(spaces), -1)
 	return r
 
-def compile_data():
+def compile_data(from_dir : str) -> {}:
 	r = {'in': [], 'out': [], 'src': []}
-	for n, path in enumerate(glob(COMPILE_INPUT_DIRECTORY + "/*.c")):
-		if n > MAX_DATA_LIMIT: break # XXX
+	for n, path in enumerate(glob(from_dir + "/*.c")):
+		if n > MAX_DATA_LIMIT: break
 		acc_path = path + ".acc"
 		norm_path = path + ".norm"
-		r['src'].append(path)
-		source_batches = get_source(norm_path)
+		source_batches = get_source(path, norm_path)
 		accumulation = read_acc(acc_path)
-		assert len(source_batches) == len(accumulation), (
-			f"Some retard fucked up strings in {path}."
-		)
+		if len(source_batches) != len(accumulation):
+			print(f"WARNING: Some retard fucked up strings in {path}")
+			continue
+		r['src'].append(path)
 		r['in'] += source_batches
 		r['out'] += accumulation
+		print(f"INFO: Read data from ({n}) {path}")
 	r['in'] = source_to_np_array(r['in'])
 	r['out'] = whitespace_to_np_array(r['out'])
 	return r
 
-def get_data():
-	r = []
-	with open('dataset-linux.pkl', 'rb') as f: r = pickle.load(f)
+def get_data(dataset_file : str) -> {}:
+	r = {}
+	with open(dataset_file, 'rb') as f: r = pickle.load(f)
 	assert len(r['in']) == len(r['out']), (
 		"data in and out sizes were inconsistent ("
 		+ str(r['in'].shape)
@@ -86,7 +92,6 @@ def get_data():
 
 if __name__ == "__main__":
 	if len(argv) == 2 and argv[1] == 'c': # clean compile
-		with open('dataset-linux.pkl', 'wb') as f: pickle.dump(compile_data(), f)
-	dataset = get_data()
-	print(dataset)
-
+		with open(DATASET_FILE, 'wb') as f: pickle.dump(compile_data(COMPILE_INPUT_DIRECTORY), f)
+	dataset = get_data(DATASET_FILE)
+	print(dataset['in'].shape, dataset['out'].shape)
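Note: the __main__ block above now routes everything through the new parameters. A hedged round-trip sketch of how the pieces connect after this commit (compile from COMPILE_INPUT_DIRECTORY, pickle to DATASET_FILE, load back):

    import pickle
    from config import COMPILE_INPUT_DIRECTORY            # "training_set/linux/"
    from data import compile_data, get_data, DATASET_FILE

    with open(DATASET_FILE, 'wb') as f:                   # rebuild the pickle ('c' mode)
        pickle.dump(compile_data(COMPILE_INPUT_DIRECTORY), f)

    dataset = get_data(DATASET_FILE)                      # load and sanity-check
    print(dataset['in'].shape, dataset['out'].shape)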
@@ -10,11 +10,9 @@ import tard_wrangler
 if len(argv) > 1:
 	mymodel = model.load_model(argv[1])
 else:
-	dataset = data.get_data()
+	dataset = data.get_data("dataset-linux.pkl")
 	mymodel = model.make_model(dataset)
 	timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
 	mymodel.save(MODEL_DIRECTORY + f"model_-_{timestamp}.keras")
 
-predictions = tard_wrangler.full_predict("data/xop.c.norm", mymodel)
-tard_wrangler.build("data/xop.c.norm", predictions)
-tard_wrangler.cat_build()
+print(tard_wrangler.full_predict("training_set/xop.c", "training_set/xop.c.norm", mymodel))
tard_wrangler.py

@@ -1,29 +1,35 @@
 import subprocess
+import shlex
 import numpy as np
 
 from config import *
 import data
 
+BUILD_FILE = "build_file.bin"
+
 def accumulate(path : str, output : str) -> None:
 	process = subprocess.Popen(
 		"converter.out accumulate " + path + " > " + output,
 		shell=True,
 	)
 
-def full_predict(path : str, model) -> []:
-	r = []
-	myinput = data.source_to_np_array(data.get_source(path))
-	for i in myinput:
-		r += model.predict(np.expand_dims(i, axis=0)).astype(np.uint8).tobytes()
-	return r
+def build(what : str, predictions : []) -> None:
+	print(predictions)
+	predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
+	with open(BUILD_FILE, "wb") as f: f.write(predictions)
+	shell_what = shlex.quote(what)
+	shell_what = shell_what[0] + '^' + shell_what[1:]
+	process = subprocess.Popen(
+		"converter.out build " + shell_what + " " + BUILD_FILE,
+		shell=True,
+		stdout=subprocess.PIPE,
+	)
+	r, _ = process.communicate()
+	r = r.decode('utf-8')
+	return r
 
-def build(path : str, predictions : []) -> None:
-	predictions = b''.join([i.to_bytes(1, byteorder='big', signed=False) for i in predictions])
-	with open("build_file", "wb") as f: f.write(predictions)
-	process = subprocess.Popen(
-		"converter.out build " + path + " > out.c",
-		shell=True,
-	)
-
-def cat_build():
-	with open("out.c") as f: print(f.read())
+def full_predict(path : str, normpath : str, model) -> [str]:
+	r = ["\n"]
+	batches = data.get_source(path, normpath)
+	for b in batches:
+		b[0] = r[-1]
+		myinput = data.source_to_np_array([b])
+		prediction = model.predict(myinput).astype(np.uint8).tobytes()
+		predicted_string = build(b[1], prediction)
+		r += predicted_string + "\n"
+	r = ''.join(r)
+	return r
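Note: full_predict is now autoregressive: each batch's first slot is overwritten with the line predicted for the previous row before the model sees it, so every prediction is conditioned on the model's own output. A minimal paraphrase of that loop using the same data/tard_wrangler helpers; it uses list.append, since the diff's r += predicted_string + "\n" extends the list one character at a time, which looks unintended:

    import numpy as np
    import data
    import tard_wrangler

    def full_predict_sketch(path, normpath, model):
        r = ["\n"]                                  # seed for the "previous line" slot
        for b in data.get_source(path, normpath):
            b[0] = r[-1]                            # feed the last prediction back in
            myinput = data.source_to_np_array([b])
            shims = model.predict(myinput).astype(np.uint8).tobytes()
            r.append(tard_wrangler.build(b[1], shims) + "\n")
        return ''.join(r)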