This commit is contained in:
anon
2024-10-02 19:52:22 +02:00
commit f24fac2ddf
7 changed files with 374 additions and 0 deletions

43
data.py Normal file
View File

@ -0,0 +1,43 @@
import re
from bidict import bidict
#CHAR_TOKENS = bidict({
# '': 0,
# '\n': 1,
#})
#CHAR_TOKEN_OFFSET = 1
def encode(s : str) -> str:
return re.sub(r'\s+', ' ', s)
#def decode(s : str, o : [int]) -> str:
# result = []
# space_index = 0
# for char in s:
# if char == ' ':
# if o[space_index] in CHAR_TOKENS.inverse:
# result.append(CHAR_TOKENS.inverse[o[space_index]])
# else:
# result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
# space_index += 1
# else:
# result.append(char)
# return ''.join(result)
def decode(s : str, o : [int]) -> str:
result = []
space_index = 0
for char in s:
if char == ' ':
result.append(' ' * (o[space_index])
space_index += 1
else:
result.append(char)
return ''.join(result)
def batchificate(f):
BATCH_SIZE = 32
s = open(f, 'r').read()
s = encode(s)
print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))