From f24fac2ddfe34a851d494cf1ee3c083b01bbac11 Mon Sep 17 00:00:00 2001
From: anon
Date: Wed, 2 Oct 2024 19:52:22 +0200
Subject: [PATCH] init

---
 .gitignore          |   3 ++
 converter.l         | 119 ++++++++++++++++++++++++++++++++++++++++++++
 data.py             |  43 ++++++++++++++++
 in/assignments.list |  50 +++++++++++++++++++
 in/xop.c            |  60 ++++++++++++++++++++++
 main.py             |  98 ++++++++++++++++++++++++++++++++++++
 requirements.txt    |   1 +
 7 files changed, 374 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 converter.l
 create mode 100644 data.py
 create mode 100644 in/assignments.list
 create mode 100644 in/xop.c
 create mode 100644 main.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1f0f7a5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+venv/
+*.yy.*
+*.out
diff --git a/converter.l b/converter.l
new file mode 100644
index 0000000..cbbcd48
--- /dev/null
+++ b/converter.l
@@ -0,0 +1,119 @@
+/* @BAKE
+   flex -o $*.yy.c $@
+   gcc -o $*.out $*.yy.c
+   @STOP
+ */
+%{
+	/* NOTE: this shall be compiled as a shared library so python may call in
+	 */
+	/* XXX: we have a problem on nuking system includes;
+	   this fucks with trying to be language agnostic;
+	   i wonder if hopefully the AI can just realize theres never spaces there
+	 */
+	#include <string.h> /* NOTE(review): include target lost in transit; strcmp/strlen need string.h -- confirm */
+
+	int mystate;
+
+	int accumulator = 0;
+
+	#define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
+
+	#define EOL '\n'
+%}
+
+comment_marker	(\/\*)|(\*\/)
+identifier	\$?[A-Za-z0-9_]+
+modify	[+-]{2}
+assignment	([+-/*%]|(<<)|(>>))=
+shift	(<<)|(>>)
+
+word	{identifier}|{comment_marker}|{assignment}|{shift}|{modify}
+
+%x NORMALIZE ACCUMULATE
+%x IN_STRING
+%option noyywrap nodefault
+%%
+	BEGIN mystate;
+	if (mystate == ACCUMULATE) {
+		ECHOS("[");
+	}
+
+<NORMALIZE>{
+[ ]|\t	{ ; }
+\"	{
+		ECHO;
+		BEGIN IN_STRING;
+	}
+{word}|.	{
+		ECHO;
+		ECHOS(" ");
+	}
+\n	{
+		ECHO;
+		return EOL;
+	}
+}
+
+<ACCUMULATE>{
+[ ]	{
+		++accumulator;
+	}
+\t	{
+		accumulator += 4;
+	}
+\"	{
+		BEGIN IN_STRING;
+	}
+{word}|.	{
+		printf("%d, ", accumulator);
+		accumulator = 0;
+	}
+\n\n	{
+		ECHOS("]\n[0]\n[");
+	}
+\n	{
+		ECHOS("]\n[");
+	}
+}
+
+<IN_STRING>{
+\\\"	{
+		if (mystate == NORMALIZE) {
+			ECHO;
+		}
+	}
+\"	{
+		if (mystate == NORMALIZE) {
+			ECHO;
+		}
+		BEGIN mystate;
+	}
+.|\n	{
+		if (mystate == NORMALIZE) {
+			ECHO;
+		}
+	}
+}
+%%
+
+signed main(const int argc, const char * const * const argv) {
+	if (argc < 3) {
+		puts("Usage: converter <normalize|accumulate> <file>");
+		return 1;
+	}
+
+	if (!strcmp(argv[1], "normalize")) {
+		mystate = NORMALIZE;
+	} else
+	if (!strcmp(argv[1], "accumulate")) {
+		mystate = ACCUMULATE;
+	} else {
+		return 1;
+	}
+
+	yyin = fopen(argv[2], "r");
+
+	while (yylex() == EOL) { ; }
+
+	return 0;
+}
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..d4db29c
--- /dev/null
+++ b/data.py
@@ -0,0 +1,44 @@
+import re
+from bidict import bidict
+
+#CHAR_TOKENS = bidict({
+#    '': 0,
+#    '\n': 1,
+#})
+#CHAR_TOKEN_OFFSET = 1
+
+def encode(s : str) -> str:
+    return re.sub(r'\s+', ' ', s)
+
+#def decode(s : str, o : [int]) -> str:
+#    result = []
+#    space_index = 0
+#    for char in s:
+#        if char == ' ':
+#            if o[space_index] in CHAR_TOKENS.inverse:
+#                result.append(CHAR_TOKENS.inverse[o[space_index]])
+#            else:
+#                result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
+#            space_index += 1
+#        else:
+#            result.append(char)
+#    return ''.join(result)
+
+def decode(s : str, o : [int]) -> str:
+    result = []
+    space_index = 0
+    for char in s:
+        if char == ' ':
+            # FIX: original line was missing its closing parenthesis (SyntaxError)
+            result.append(' ' * (o[space_index]))
+            space_index += 1
+        else:
+            result.append(char)
+    return ''.join(result)
+
+def batchificate(f):
+    BATCH_SIZE = 32
+    s = open(f, 'r').read()
+    s = encode(s)
+
+print(decode(encode('if ( a == b ) { a = c )'), [2,0,2,2,0,1,0,4,1,1]))
diff --git a/in/assignments.list b/in/assignments.list
new file mode 100644
index 0000000..0af0d3c
--- /dev/null
+++ b/in/assignments.list
@@ -0,0 +1,50 @@
+x = 10
+y = 3.14
+str_var = "Hello, World!"
+is_true = True
+list_var = [1, 2, 3, 4, 5]
+dict_var = {"key": "value", "another_key": "another_value"}
+tuple_var = (1, 2, 3)
+set_var = {1, 2, 3}
+let x = 10;
+let y = 3.14;
+let strVar = "Hello, World!";
+let isTrue = true;
+let arrayVar = [1, 2, 3, 4, 5];
+let objectVar = {"key": "value", "anotherKey": "anotherValue"};
+let tupleVar = [1, 2, 3];
+let setVar = new Set([1, 2, 3]);
+int x = 10;
+double y = 3.14;
+String strVar = "Hello, World!";
+boolean isTrue = true;
+int[] arrayVar = {1, 2, 3, 4, 5};
+HashMap<Integer, String> mapVar = new HashMap<>();
+Tuple tuVar = new Tuple(1, 2);
+Set<Integer> setVar = new HashSet<>();
+int x = 10;
+double y = 3.14;
+string strVar = "Hello, World!";
+bool isTrue = true;
+int[] arrayVar = {1, 2, 3, 4, 5};
+Dictionary<string, string> dictVar = new Dictionary<string, string>();
+dictVar.Add("key", "value");
+dictVar.Add("anotherKey", "anotherValue");
+Tuple<int, int> tupleVar = new Tuple<int, int>(1, 2);
+HashSet<int> setVar = new HashSet<int>();
+x = 10
+y = 3.14
+str_var = "Hello, World!"
+is_true = true
+array_var = [1, 2, 3, 4, 5]
+hash_var = {"key" => "value", "another_key" => "another_value"}
+tuple_var = [1, 2, 3]
+set_var = Set.new([1, 2, 3])
+var x: Int = 10
+var y: Double = 3.14
+var strVar: String = "Hello, World!"
+var isTrue: Bool = true
+var arrayVar: [Int] = [1, 2, 3, 4, 5]
+var dictVar: [String: String] = ["key": "value", "anotherKey": "anotherValue"]
+var tupleVar: (Int, Int) = (1, 2)
+var setVar: Set<Int> = [1, 2, 3]
diff --git a/in/xop.c b/in/xop.c
new file mode 100644
index 0000000..9a77c9e
--- /dev/null
+++ b/in/xop.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023 : Ognjen 'xolatile' Milan Robovic
+ *
+ * Xop is free software!
+ * You will redistribute it or modify it under the terms of
+ * the GNU General Public License by Free Software Foundation.
+ * And when you do redistribute it or modify it,
+ * it will use either version 3 of the License,
+ * or (at yours truly opinion) any later version.
+ * It is distributed in the hope that it will be useful or harmful,
+ * it really depends...
+ * But no warranty what so ever, seriously.
+ * See GNU/GPLv3.
+ */
+
+#include <stdlib.h> /* NOTE(review): include targets lost in transit; EXIT_SUCCESS needs stdlib.h -- confirm */
+#include <xolatile/xtandard.c> /* NOTE(review): guessed -- something must provide fatal_failure/file_*/echo_*/allocate */
+
+int main (int argc, char * * argv) {
+	int file = -1;
+	int size = 0;
+	int offset = 0;
+
+	unsigned char * buffer = NULL;
+
+	if (argc != 2) {
+		fatal_failure (1, "xop: xop input");
+	}
+
+	file = file_open (argv [1], O_RDONLY);
+	size = file_size (file);
+
+	buffer = allocate (size);
+
+	file_read (file, buffer, size);
+
+	file = file_close (file);
+
+	do {
+		int byte = (int) buffer [offset];
+		if (byte == 0X90) {
+			echo_new_line ();
+			terminal_style (EFFECT_NORMAL, COLOUR_YELLOW);
+			echo_byte ((int) buffer [offset]);
+			terminal_style (-1, -1);
+		} else {
+			echo_byte (buffer [offset]);
+		}
+
+		++offset;
+	} while (offset != size);
+
+	echo_new_line ();
+
+	buffer = deallocate (buffer);
+
+	return (EXIT_SUCCESS);
+}
+
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..f6b471e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,106 @@
+import subprocess
+import os
+import numpy as np
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+import tensorflow
+from tensorflow import keras
+from keras import layers
+
+LINE_WIDTH = 80
+MAX_SHIMS = LINE_WIDTH - 1
+
+
+def get_data():
+    r = []
+    def get_source(path : str) -> [str]:
+        '''returns source file 3 line batches'''
+        r = []
+        with open(path, 'r') as file:
+            lines = []
+            for line in file:
+                lines.append(line.strip())
+            r = [lines[i:i + 3] for i in range(0, len(lines), 3)]
+        return r
+    def source_to_np_array(source_batches : []) -> np.array:
+        r = []
+        for s in source_batches:
+            ascii_list = []
+            for l in s:
+                l = l[:LINE_WIDTH]
+                l = l.ljust(LINE_WIDTH)
+                l = [ord(i) for i in l]
+                ascii_list += l
+            n = np.reshape(ascii_list, (3, -1, 1))
+            n = np.expand_dims(n, axis=0)
+            r.append(n)
+        return r
+    def get_whitespace(path : str) -> [int]:
+        '''XXX returns the whitespace list of every middle line'''
+        r = []
+        output_file = "muf_file.txt"
+        process = subprocess.Popen(
+            "converter.out accumulate " + path + " > " + output_file,
+            shell=True,
+        )
+        # FIX: wait for the converter to finish before reading its output,
+        # otherwise the file may be read while empty or partial (race).
+        process.wait()
+        with open(output_file, 'r') as file:
+            for n, line in enumerate(file):
+                if ((n + 2) % 3) != 0: continue
+                # XXX eval() on tool output -- trusted local converter here,
+                # but ast.literal_eval would be the safer choice.
+                r.append(eval(line))
+        return r
+    source = source_to_np_array(get_source("in/xop.c"))
+    whitespace = get_whitespace("in/xop.c")
+    whitespace = [np.array(i) for i in whitespace]
+    r = {'in': source, 'out': whitespace}
+    return r
+
+data = get_data()
+# FIX: original asserted len(data['in']) == len(data['in']) -- a no-op self-comparison.
+assert len(data['in']) == len(data['out']), "data in and out sizes were inconsistent."
+print(data['in'], data['out'])
+
+model = keras.Sequential([
+    layers.Conv2D(
+        filters=16,
+        kernel_size=(3,3),
+        strides=(1,1),
+        activation='relu',
+        padding='valid',
+        input_shape=(3,LINE_WIDTH,1)
+    ),
+    #layers.Conv2D(
+    #    filters=32,
+    #    kernel_size=(3,7),
+    #    activation='relu',
+    #    padding='valid'
+    #),
+    #layers.Conv2D(
+    #    filters=64,
+    #    kernel_size=(3,13),
+    #    activation='relu',
+    #    padding='valid'
+    #),
+    layers.Flatten(),
+    layers.Dense(64, activation='relu'),
+    # NOTE(review): softmax output trained with 'mse' against raw whitespace
+    # counts is a dubious pairing -- confirm intended loss/activation.
+    layers.Dense(MAX_SHIMS, activation='softmax')
+])
+
+model.compile(
+    optimizer='adam',
+    loss='mse',
+    metrics=['accuracy']
+)
+
+model.fit(data['in'], data['out'],
+    verbose=2,
+    batch_size=10,
+    epochs=50,
+    shuffle=True,
+)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0f57144
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+bidict
+numpy
+tensorflow