init

2024-10-02 19:52:22 +02:00
commit f24fac2ddf
7 changed files with 374 additions and 0 deletions
--- a/data.py
+++ b/data.py
@ -0,0 +1,43 @@
+import re
+from bidict import bidict
+
+#CHAR_TOKENS = bidict({
+#	'':   0,
+#	'\n': 1,
+#})
+#CHAR_TOKEN_OFFSET = 1
+
+def encode(s : str) -> str:
+    return re.sub(r'\s+', ' ', s)
+
+#def decode(s : str,  o : [int]) -> str:
+#    result = []
+#    space_index = 0
+#    for char in s:
+#        if char == ' ':
+#            if o[space_index] in CHAR_TOKENS.inverse:
+#                result.append(CHAR_TOKENS.inverse[o[space_index]])
+#            else:
+#                result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
+#            space_index += 1
+#        else:
+#            result.append(char)
+#    return ''.join(result)
+
+def decode(s : str,  o : [int]) -> str:
+    result = []
+    space_index = 0
+    for char in s:
+        if char == ' ':
+                result.append(' ' * (o[space_index])
+            space_index += 1
+        else:
+            result.append(char)
+    return ''.join(result)
+
+def batchificate(f):
+	BATCH_SIZE = 32
+	s = open(f, 'r').read()
+	s = encode(s)
+
+print(decode(encode('if ( a  == b ) {   a = c   )'), [2,0,2,2,0,1,0,4,1,1]))