ai_formatter/data.py

from glob import glob
import numpy as np
import pickle
import sys
from sys import argv

from config import *
import tard_wrangler

#MAX_DATA_LIMIT = sys.maxsize
MAX_DATA_LIMIT = 1000

def get_source(path : str, normpath : str) -> [str]:
	'''returns source file in $SOURCE_LINE_BATCH_SIZE line batches'''
	r = []
	# read data
	with open(path, 'r') as f: lines = [line[:-1] for line in f]
	with open(normpath, 'r') as f: normlines = [line[:-1] for line in f]
	# pad with empty lines
	for i in range(int((SOURCE_LINE_BATCH_SIZE-1)/2)):
		lines.insert(0, "")
		normlines.append("")
	# batch
	for i in range(len(lines)-1):
		r.append([lines[i]] + normlines[i:i+SOURCE_LINE_BATCH_SIZE-1])
	return r

def source_to_np_array(source_batches : []) -> np.array:
	'''returns image like array from batches'''
	r = []
	for s in source_batches:
		ascii_list = []
		for l in s:
			l = l[:LINE_WIDTH]			# cut long lines
			l = l.ljust(LINE_WIDTH)		# pad short lines
			l = [ord(i) for i in l]
			ascii_list += l
		n = np.reshape(ascii_list, (3, -1, 1))
		r.append(n)
	r = np.array(r)
	return r

def read_acc(path : str) -> [[int]]:
	r = []
	with open(path, 'r') as file:
		for line in file:
			try:
				l = eval(line)
				if len(l) < MAX_SHIMS: l = l + [0] * (MAX_SHIMS - len(l))
				else: l = l[:MAX_SHIMS]
				r.append(l)
			except: pass
	return r

def whitespace_to_np_array(spaces : []) -> np.array:
	r = spaces
	r = np.array(r).reshape(len(spaces), -1)
	return r

def compile_data(from_dir : str) -> {}:
	r = {'in': [], 'out': [], 'src': []}
	for n, path in enumerate(glob(from_dir + "/*.c")):
		if n > MAX_DATA_LIMIT: break
		acc_path  = path + ".acc"
		norm_path = path + ".norm"
		source_batches = get_source(path, norm_path)
		accumulation   = read_acc(acc_path)
		if len(source_batches) != len(accumulation):
			print(f"WARNING: Some retard fucked up strings in {path}")
			continue
		r['src'].append(path)
		r['in']  += source_batches
		r['out'] += accumulation
		print(f"INFO: Read data from ({n}) {path}")
	r['in']  = source_to_np_array(r['in'])
	r['out'] = whitespace_to_np_array(r['out'])
	return r

def get_data(dataset_file : str) -> {}:
	r = {}
	with open(dataset_file, 'rb') as f: r = pickle.load(f)
	assert len(r['in']) == len(r['out']), (
			"data in and out sizes were inconsistent ("
			+ str(r['in'].shape)
			+ " "
			+ str(r['out'].shape)
			+ "."
	)
	return r

if __name__ == "__main__":
	if len(argv) == 2 and argv[1] == 'c': # clean compile
		with open(DATASET_FILE, 'wb') as f: pickle.dump(compile_data(COMPILE_INPUT_DIRECTORY), f)
	dataset = get_data(DATASET_FILE)
	print(dataset['in'].shape, dataset['out'].shape)