diff --git a/.gitignore b/.gitignore
index 1f0f7a5..b0b4332 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 venv/
 *.yy.*
 *.out
+__pycache__/
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..816987c
--- /dev/null
+++ b/config.py
@@ -0,0 +1,2 @@
+LINE_WIDTH = 80
+MAX_SHIMS = LINE_WIDTH - 1
diff --git a/data.py b/data.py
index d4db29c..68fb6c8 100644
--- a/data.py
+++ b/data.py
@@ -1,43 +1,53 @@
-import re
-from bidict import bidict
+import subprocess
+import numpy as np
 
-#CHAR_TOKENS = bidict({
-#    '': 0,
-#    '\n': 1,
-#})
-#CHAR_TOKEN_OFFSET = 1
+from config import *
 
-def encode(s : str) -> str:
-    return re.sub(r'\s+', ' ', s)
+def get_data():
+    r = []
+    INPUT_FILE = "data/xop.c"
+    def get_source(path : str) -> [str]:
+        '''returns source file 3 line batches'''
+        r = []
+        with open(path, 'r') as file:
+            lines = []
+            for line in file:
+                lines.append(line.strip())
+            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
+        return r
+    def source_to_np_array(source_batches : []) -> np.array:
+        r = []
+        for s in source_batches:
+            ascii_list = []
+            for l in s:
+                l = l[:LINE_WIDTH]
+                l = l.ljust(LINE_WIDTH)
+                l = [ord(i) for i in l]
+                ascii_list += l
+            n = np.reshape(ascii_list, (3, -1, 1))
+            n = np.expand_dims(n, axis=0)
+            r.append(n)
+        return r
+    def get_whitespace(path : str) -> [int]:
+        '''XXX returns the whitespace list of every middle line'''
+        r = []
+        output_file = "muf_file.txt"
+        subprocess.run(
+            "converter.out accumulate " + path + " > " + output_file,
+            shell=True,
+        )
+        with open(output_file, 'r') as file:
+            for n, line in enumerate(file):
+                if ((n + 2) % 3) != 0: continue
+                r.append(eval(line))
+        return r
+    source = source_to_np_array(get_source(INPUT_FILE))
+    whitespace = get_whitespace(INPUT_FILE)
+    whitespace = [np.array(i) for i in whitespace]
+    r = {'in': source, 'out': whitespace}
+    assert len(r['in']) == len(r['out']), "data in and out sizes were inconsistent."
+    return r
 
-#def decode(s : str, o : [int]) -> str:
-#    result = []
-#    space_index = 0
-#    for char in s:
-#        if char == ' ':
-#            if o[space_index] in CHAR_TOKENS.inverse:
-#                result.append(CHAR_TOKENS.inverse[o[space_index]])
-#            else:
-#                result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
-#            space_index += 1
-#        else:
-#            result.append(char)
-#    return ''.join(result)
-
-def decode(s : str, o : [int]) -> str:
-    result = []
-    space_index = 0
-    for char in s:
-        if char == ' ':
-            result.append(' ' * (o[space_index])
-            space_index += 1
-        else:
-            result.append(char)
-    return ''.join(result)
-
-def batchificate(f):
-    BATCH_SIZE = 32
-    s = open(f, 'r').read()
-    s = encode(s)
-
-print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))
+if __name__ == "__main__":
+    dataset = get_data()
+    print(dataset)
diff --git a/formatter.py b/formatter.py
index f6b471e..dfd2b51 100644
--- a/formatter.py
+++ b/formatter.py
@@ -1,62 +1,15 @@
-import subprocess
-import os
 import numpy as np
+import os
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import tensorflow
 from tensorflow import keras
 from keras import layers
 
-LINE_WIDTH = 80
-MAX_SHIMS = LINE_WIDTH - 1
+from config import *
+import data
 
 
-
-def get_data():
-    r = []
-    def get_source(path : str) -> [str]:
-        '''returns source file 3 line batches'''
-        r = []
-        with open(path, 'r') as file:
-            lines = []
-            for line in file:
-                lines.append(line.strip())
-            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
-        return r
-    def source_to_np_array(source_batches : []) -> np.array:
-        r = []
-        for s in source_batches:
-            ascii_list = []
-            for l in s:
-                l = l[:LINE_WIDTH]
-                l = l.ljust(LINE_WIDTH)
-                l = [ord(i) for i in l]
-                ascii_list += l
-            n = np.reshape(ascii_list, (3, -1, 1))
-            n = np.expand_dims(n, axis=0)
-            r.append(n)
-        return r
-    def get_whitespace(path : str) -> [int]:
-        '''XXX returns the whitespace list of every middle line'''
-        r = []
-        output_file = "muf_file.txt"
-        process = subprocess.Popen(
-            "converter.out accumulate " + path + " > " + output_file,
-            shell=True,
-        )
-        with open(output_file, 'r') as file:
-            for n, line in enumerate(file):
-                if ((n + 2) % 3) != 0: continue
-                r.append(eval(line))
-        return r
-    source = source_to_np_array(get_source("in/xop.c"))
-    whitespace = get_whitespace("in/xop.c")
-    whitespace = [np.array(i) for i in whitespace]
-    r = {'in': source, 'out': whitespace}
-    return r
-
-data = get_data()
-assert len(data['in']) == len(data['in']), "data in and out sizes were inconsistent."
-print(data['in'], data['out'])
+dataset = data.get_data()
 
 model = keras.Sequential([
     layers.Conv2D(
@@ -90,7 +43,7 @@ model.compile(
     metrics=['accuracy']
 )
 
-model.fit(data['in'], data['out'],
+model.fit(dataset['in'], dataset['out'],
     verbose=2,
     batch_size=10,
     epochs=50,