init

2024-10-02 19:52:22 +02:00
commit f24fac2ddf
7 changed files with 374 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
 venv/
 *.yy.*
 *.out
--- a/converter.l
+++ b/converter.l
@ -0,0 +1,119 @@
 /* @BAKE
    flex -o $*.yy.c $@
    gcc -o $*.out $*.yy.c
   @STOP
 */
 %{
    /* NOTE: this shall be compiled as a shared library so python may call in
    */
    /* XXX: normalization also splits up system include paths
             (e.g. <stdio.h>), which breaks them; special-casing
             them would conflict with staying language agnostic.
             Hopefully the AI can simply learn that there are never
             spaces inside them.
    */
    #include <stdio.h>
    #include <string.h>
    /* Start condition selected on the command line (NORMALIZE or ACCUMULATE);
       re-entered at the top of yylex() and when a string literal ends. */
    int mystate;
    /* ACCUMULATE mode: whitespace columns seen since the last emitted count
       (a space adds 1, a tab adds 4). */
    int accumulator = 0;
    /* Write the NUL-terminated string s to the scanner's output stream. */
    #define ECHOS(s) fwrite(s, strlen(s), 1, yyout)
    /* Value yylex() returns after each line in NORMALIZE mode. */
    #define EOL '\n'
 %}
 /* Building blocks of a "word": a multi-character lexeme that both modes
    treat as one token. Any other single character is a token by itself. */
 comment_marker  (\/\*)|(\*\/)
 identifier      \$?[A-Za-z0-9_]+
 /* Only ++ and --; a {2} repetition of [+-] would also accept +- and -+. */
 modify          (\+\+)|(--)
 /* '-' comes first in the class so it is a literal. Written as [+-/...] it
    forms the range '+'..'/' and would also accept ",=" and ".=". */
 assignment      ([-+/*%]|(<<)|(>>))=
 shift           (<<)|(>>)
 word    {identifier}|{comment_marker}|{assignment}|{shift}|{modify}
 /* Exclusive start conditions: the two operating modes plus the state used
    while inside a double-quoted string. */
 %x NORMALIZE ACCUMULATE
 %x IN_STRING
 %option noyywrap nodefault
 %%
    /* Executed on every entry to yylex(): switch to the start condition
       picked in main() and, in ACCUMULATE mode, open the first list. */
    BEGIN mystate;
    if (ACCUMULATE == mystate)
        ECHOS("[");
 <NORMALIZE>{
 [ \t]       { /* the original spacing is discarded */ }
 \"          {
                 /* copy the opening quote; the string body is handled
                    verbatim by the IN_STRING rules */
                 ECHO;
                 BEGIN IN_STRING;
             }
 {word}|.    {
                 /* every token is followed by exactly one space */
                 ECHO;
                 fputc(' ', yyout);
             }
 \n          {
                 /* keep the line break and hand control back to the
                    caller once per line */
                 ECHO;
                 return EOL;
             }
 }
 <ACCUMULATE>{
 [ ]         {
                 /* one more column of whitespace in front of the next token */
                 ++accumulator;
             }
 \t          {
                 /* a tab is counted as four columns */
                 accumulator += 4;
             }
 \"          {
                 /* NOTE(review): no count is emitted for a string literal,
                    so the whitespace in front of it is added to the count
                    of the token that follows the string on the same line.
                    This mirrors NORMALIZE, which emits no space after a
                    string -- confirm that this is intended. */
                 BEGIN IN_STRING;
             }
 {word}|.    {
                 /* emit the whitespace that preceded this token; written to
                    yyout like all other output of the scanner */
                 fprintf(yyout, "%d, ", accumulator);
                 accumulator = 0;
             }
 \n\n        {
                 /* end of line followed by an empty line, which is reported
                    as [0]; whitespace still pending belongs to no token and
                    must not leak into the next line */
                 accumulator = 0;
                 ECHOS("]\n[0]\n[");
             }
 \n          {
                 /* close this line's list and open the next one; pending
                    (trailing) whitespace is dropped for the same reason */
                 accumulator = 0;
                 ECHOS("]\n[");
             }
 <<EOF>>     {
                 /* close the list that is still open so the last output
                    line is a well-formed list as well */
                 ECHOS("]\n");
                 yyterminate();
             }
 }
 <IN_STRING>{
 \\(.|\n)    {
             /* A backslash escapes whatever follows it. Consuming the pair
                as a unit keeps \" from ending the string and keeps the
                second backslash of \\ from escaping a closing quote.
                The text is copied only in NORMALIZE mode. */
             if (mystate == NORMALIZE) {
                 ECHO;
             }
         }
 \"      {
             /* closing quote: back to the mode selected on the command line */
             if (mystate == NORMALIZE) {
                 ECHO;
             }
             BEGIN mystate;
         }
 .|\n    {
             /* string contents, including any whitespace, pass through
                unchanged in NORMALIZE mode and are skipped otherwise */
             if (mystate == NORMALIZE) {
                 ECHO;
             }
         }
 }
 %%
 /* Entry point: converter <mode> <file>.
    <mode> is "normalize" (rewrite <file> with one space after every token)
    or "accumulate" (print, per line, the whitespace counts found in front of
    the tokens). Output goes to stdout, diagnostics to stderr.
    Returns 0 on success and 1 on a usage error or an unreadable file. */
 signed main(const int argc, const char * const * const argv) {
    if (argc < 3) {
        fputs("Usage: converter <mode> <file>\n", stderr);
        return 1;
    }
    if (!strcmp(argv[1], "normalize")) {
        mystate = NORMALIZE;
    } else
    if (!strcmp(argv[1], "accumulate")) {
        mystate = ACCUMULATE;
    } else {
        fprintf(stderr, "converter: unknown mode '%s'\n", argv[1]);
        fputs("Usage: converter <mode> <file>\n", stderr);
        return 1;
    }
    yyin = fopen(argv[2], "r");
    if (!yyin) {
        /* a NULL yyin would make the scanner read stdin instead */
        perror(argv[2]);
        return 1;
    }
    /* yylex() returns EOL after each line in normalize mode and 0 at the
       end of the input */
    while(yylex() == EOL) { ; }
    fclose(yyin);
    return 0;
 }
--- a/data.py
+++ b/data.py
@ -0,0 +1,43 @@
 import re
 from bidict import bidict
 #CHAR_TOKENS = bidict({
 #	'':   0,
 #	'\n': 1,
 #})
 #CHAR_TOKEN_OFFSET = 1
 def encode(s : str) -> str:
     """Collapse every run of whitespace in *s* into a single space."""
     whitespace_run = re.compile(r'\s+')
     return whitespace_run.sub(' ', s)
 #def decode(s : str,  o : [int]) -> str:
 #    result = []
 #    space_index = 0
 #    for char in s:
 #        if char == ' ':
 #            if o[space_index] in CHAR_TOKENS.inverse:
 #                result.append(CHAR_TOKENS.inverse[o[space_index]])
 #            else:
 #                result.append(' ' * (o[space_index] - CHAR_TOKEN_OFFSET))
 #            space_index += 1
 #        else:
 #            result.append(char)
 #    return ''.join(result)
 def decode(s : str,  o : [int]) -> str:
     """Re-expand the single spaces of an encoded string.
     The n-th space of *s* is replaced by ``o[n]`` spaces (0 removes it);
     every other character is copied unchanged. Surplus entries of *o*
     are ignored.
     Raises IndexError when *s* holds more spaces than *o* has entries.
     """
     result = []
     space_index = 0
     for char in s:
         if char == ' ':
             result.append(' ' * o[space_index])
             space_index += 1
         else:
             result.append(char)
     return ''.join(result)
 def batchificate(f):
 	"""Read the file at path *f* and collapse its whitespace with encode().
 	Work in progress: neither BATCH_SIZE nor the encoded text is used yet,
 	and nothing is returned.
 	"""
 	BATCH_SIZE = 32
 	# the context manager closes the handle, which was leaked before
 	with open(f, 'r') as source_file:
 		s = source_file.read()
 	s = encode(s)
 # Demo, runs on import: collapse the whitespace of a sample line, then
 # re-expand its ten remaining spaces with the given per-space counts.
 print(decode(encode('if ( a  == b ) {   a = c   )'), [2,0,2,2,0,1,0,4,1,1]))
--- a/in/assignments.list
+++ b/in/assignments.list
@ -0,0 +1,50 @@
 x = 10
 y = 3.14
 str_var = "Hello, World!"
 is_true = True
 list_var = [1, 2, 3, 4, 5]
 dict_var = {"key": "value", "another_key": "another_value"}
 tuple_var = (1, 2, 3)
 set_var = {1, 2, 3}
 let x = 10;
 let y = 3.14;
 let strVar = "Hello, World!";
 let isTrue = true;
 let arrayVar = [1, 2, 3, 4, 5];
 let objectVar = {"key": "value", "anotherKey": "anotherValue"};
 let tupleVar = [1, 2, 3];
 let setVar = new Set([1, 2, 3]);
 int x = 10;
 double y = 3.14;
 String strVar = "Hello, World!";
 boolean isTrue = true;
 int[] arrayVar = {1, 2, 3, 4, 5};
 HashMap<String, String> mapVar = new HashMap<>();
 Tuple tuVar = new Tuple(1, 2);
 Set<Integer> setVar = new HashSet<>();
 int x = 10;
 double y = 3.14;
 string strVar = "Hello, World!";
 bool isTrue = true;
 int[] arrayVar = {1, 2, 3, 4, 5};
 Dictionary<string, string> dictVar = new Dictionary<string, string>();
 dictVar.Add("key", "value");
 dictVar.Add("anotherKey", "anotherValue");
 Tuple<int, int> tupleVar = new Tuple<int, int>(1, 2);
 HashSet<int> setVar = new HashSet<int>();
 x = 10
 y = 3.14
 str_var = "Hello, World!"
 is_true = true
 array_var = [1, 2, 3, 4, 5]
 hash_var = {"key" => "value", "another_key" => "another_value"}
 tuple_var = [1, 2, 3]
 set_var = Set.new([1, 2, 3])
 var x: Int = 10
 var y: Double = 3.14
 var strVar: String = "Hello, World!"
 var isTrue: Bool = true
 var arrayVar: [Int] = [1, 2, 3, 4, 5]
 var dictVar: [String: String] = ["key": "value", "anotherKey": "anotherValue"]
 var tupleVar: (Int, Int) = (1, 2)
 var setVar: Set<Int> = [1, 2, 3]
--- a/in/xop.c
+++ b/in/xop.c
@ -0,0 +1,60 @@
 /*
 * Copyright (c) 2023 : Ognjen 'xolatile' Milan Robovic
 *
 * Xop is free software!
 * You will redistribute it or modify it under the terms of
 * the GNU General Public License by Free Software Foundation.
 * And when you do redistribute it or modify it,
 * it will use either version 3 of the License,
 * or (at yours truly opinion) any later version.
 * It is distributed in the hope that it will be useful or harmful,
 * it really depends...
 * But no warranty what so ever, seriously.
 * See GNU/GPLv3.
 */
 #include <xolatile/xtandard.h>
 #include <xolatile/xtandard.c>
 /*
 * Passes every byte of the file named by the single argument to echo_byte;
 * a byte equal to 0X90 is preceded by echo_new_line and wrapped in
 * terminal_style calls.
 * NOTE(review): all helpers come from the xolatile headers included above
 * and are not visible here; remarks on them are inferred from their names.
 */
 int main (int argc, char * * argv) {
 	int file   = -1;
 	int size   = 0;
 	int offset = 0;
 	unsigned char * buffer = NULL;
 	/* Exactly one argument is accepted. */
 	if (argc != 2) {
 		fatal_failure (1, "xop: xop input");
 	}
 	/* Presumably loads the whole file into buffer - confirm. */
 	file = file_open (argv [1], O_RDONLY);
 	size = file_size (file);
 	buffer = allocate (size);
 	file_read (file, buffer, size);
 	file = file_close (file);
 	/*
 	* NOTE(review): the body runs before size is checked, so with size == 0
 	* buffer [0] is read anyway and offset != size stays true afterwards;
 	* confirm that empty input cannot reach this point.
 	*/
 	do {
 		int byte = (int) buffer [offset];
 		if (byte == 0X90) {
 			echo_new_line  ();
 			terminal_style (EFFECT_NORMAL, COLOUR_YELLOW);
 			echo_byte      ((int) buffer [offset]);
 			terminal_style (-1, -1);
 		} else {
 			echo_byte (buffer [offset]);
 		}
 		++offset;
 	} while (offset != size);
 	echo_new_line ();
 	buffer = deallocate (buffer);
 	return (EXIT_SUCCESS);
 }
--- a/main.py
+++ b/main.py
@ -0,0 +1,98 @@
 import subprocess
 import os
 import numpy as np
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 import tensorflow
 from tensorflow import keras
 from keras import layers
 LINE_WIDTH = 80
 MAX_SHIMS  = LINE_WIDTH - 1
 def get_data():
 	'''Build the training set from in/xop.c.
 	Returns a dict with two lists:
 	'in'  - one array of shape (1, 3, LINE_WIDTH, 1) per batch of three
 	        stripped source lines, holding their character codes;
 	'out' - one array per selected line of the converter output, holding
 	        the whitespace counts found on that line.
 	Raises subprocess.CalledProcessError if the converter fails.
 	'''
 	def get_source(path : str) -> [str]:
 		'''returns source file 3 line batches'''
 		with open(path, 'r') as file:
 			lines = [line.strip() for line in file]
 		return [lines[i:i + 3] for i in range(0, len(lines), 3)]
 	def source_to_np_array(source_batches : []) -> [np.array]:
 		'''turns every batch into a (1, 3, LINE_WIDTH, 1) array of character codes'''
 		r = []
 		for s in source_batches:
 			ascii_list = []
 			for l in s:
 				# cut or pad every line to exactly LINE_WIDTH characters
 				l = l[:LINE_WIDTH].ljust(LINE_WIDTH)
 				ascii_list += [ord(i) for i in l]
 			# NOTE(review): a final batch with fewer than 3 lines cannot be
 			# reshaped to 3 rows and raises ValueError here; decide whether
 			# such a batch should be padded or dropped.
 			n = np.reshape(ascii_list, (3, -1, 1))
 			n = np.expand_dims(n, axis=0)
 			r.append(n)
 		return r
 	def get_whitespace(path : str) -> [int]:
 		'''XXX returns the whitespace list of every middle line'''
 		import ast
 		import shutil
 		output_file = "muf_file.txt"
 		# same PATH lookup the shell performed, falling back to a converter
 		# built in the working directory
 		converter = shutil.which("converter.out") or "./converter.out"
 		# run() waits for the converter to finish, so the file is complete
 		# before it is read back; the argument list avoids shell quoting
 		with open(output_file, 'w') as out:
 			subprocess.run([converter, "accumulate", path], stdout=out, check=True)
 		r = []
 		with open(output_file, 'r') as file:
 			for n, line in enumerate(file):
 				# keep lines 1, 4, 7, ...: the middle line of every batch
 				if ((n + 2) % 3) != 0: continue
 				# parses the list literal without executing arbitrary code
 				r.append(ast.literal_eval(line))
 		return r
 	source = source_to_np_array(get_source("in/xop.c"))
 	whitespace = [np.array(i) for i in get_whitespace("in/xop.c")]
 	return {'in': source, 'out': whitespace}
 data = get_data()
 # every input batch needs exactly one target; the check used to compare
 # data['in'] with itself and therefore could never fail
 assert len(data['in']) == len(data['out']), "data in and out sizes were inconsistent."
 print(data['in'], data['out'])
 # One 3x3 convolution over the (3, LINE_WIDTH, 1) character grid, flattened
 # into a 64-unit hidden layer and a MAX_SHIMS-wide output layer.
 # NOTE(review): softmax makes the outputs sum to 1 although the targets are
 # whitespace counts - confirm that this activation is intended.
 model = keras.Sequential()
 model.add(layers.Conv2D(
 	filters=16,
 	kernel_size=(3,3),
 	strides=(1,1),
 	padding='valid',
 	activation='relu',
 	input_shape=(3,LINE_WIDTH,1),
 ))
 # wider convolutions, currently disabled:
 #model.add(layers.Conv2D(filters=32, kernel_size=(3,7), activation='relu', padding='valid'))
 #model.add(layers.Conv2D(filters=64, kernel_size=(3,13), activation='relu', padding='valid'))
 model.add(layers.Flatten())
 model.add(layers.Dense(64, activation='relu'))
 model.add(layers.Dense(MAX_SHIMS, activation='softmax'))
 # Adam optimiser with mean squared error as loss; accuracy is reported too.
 model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
 # 50 shuffled epochs in mini-batches of 10.
 model.fit(
 	data['in'],
 	data['out'],
 	epochs=50,
 	batch_size=10,
 	shuffle=True,
 	verbose=2,
 )
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
 tensorflow
 numpy
 bidict