diff --git a/Makefile b/Makefile index f5a64d3..a769a7d 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,14 @@ SOURCE.d := source/ OBJECT.d := object/ CFLAGS += -Wall -Wpedantic -I${SOURCE.d}/ -CPPFLAGS += ${CFLAGS} + +ifeq (${DEBUG}, 1) + LFLAGS += --debug --trace + CFLAGS += -O0 -ggdb -fno-inline + CPPFLAGS += -DDEBUG=1 +endif + +CXXFLAGS += ${CFLAGS} -std=gnu++20 OUTPUT := jeger @@ -14,13 +21,15 @@ ${OUTPUT}: object/main.o object/generator.o object/jeger.yy.o test: ./${OUTPUT} test/brainfuck.l 2>&1 | perl -pe "s/(\[.{1,4}\] = 128)/\x1b[90m\1\x1b[0m/g" cat jeger.yy.c + gcc jeger.yy.c clean: + -rm ${OBJECT.d}/*.yy.* -rm ${OBJECT.d}/*.o -rm ${OUTPUT} object/%.yy.cpp: source/%.l - flex -o $@ $< + flex ${LFLAGS} -o $@ $< object/%.o: source/%.c ${COMPILE.c} $< -o $@ diff --git a/source/generator.c b/source/generator.c index 854a0f3..8d3a64b 100644 --- a/source/generator.c +++ b/source/generator.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -6,26 +7,31 @@ #include "jeger.h" #include "snippets.inc" +// XXX //#define AS_SYMBOL(c) (c-'a') #define AS_SYMBOL(c) ((int)c) #define TOKEN_OFFSET 128 /* XXX */ +// --- +rule_t * rules; +int n_rules = 0; +char * * state_names; +int n_states = 0; int alphabet_size = 128; -rule_t * patterns; char * definition_section_code_buffer; char * code_section_code_buffer; -static int n_states = 0; static inline -void put_header(FILE * f, const int alphabet_size, const int n_states, const int no_match) { +void put_header(FILE * f, const int alphabet_size, const int no_match) { #define DEFINE_INT(m, n) fprintf(f, "#define " #m " %d\n", n); #define DEFINE_STR(m, s) fprintf(f, "#define " #m " %s\n", s); DEFINE_INT(ALPHABET_SIZE, alphabet_size); - DEFINE_INT(N_STATES, n_states); + DEFINE_INT(N_RULES, n_rules); DEFINE_INT(NO_MATCH, no_match); + DEFINE_STR(BEGIN, "state = "); DEFINE_STR(REVERSE, "(direction *= -1)"); fputs("#define AS_SYMBOL(c) c\n", /* (c-'a')\n */ f); @@ -35,15 +41,17 @@ void put_header(FILE * f, const int alphabet_size, const int n_states, const int // DEFINE_STR(TRACE, ""); // DEFINE_STR(TRACE_DEFAULT, ""); + // XXX we want no globals fputs("int mlen;\n", f); + fputs("int direction = 1;\n", f); fputs("\n", f); } static inline void put_table(FILE * f, const int * table, char * * prefixes, int n_states, int alphabet_size) { - fputs("int table[N_STATES][ALPHABET_SIZE] = {\n", f); - for (int i = 0; i < n_states; i++) { + fputs("int table[N_RULES][ALPHABET_SIZE] = {\n", f); + for (int i = 0; i < n_rules; i++) { fprintf(f, "\t[%d] = {", i); for (int h = 0; h < alphabet_size; h++) { /* NOTE: we have to awkwardly escate "\" and "'", @@ -67,13 +75,25 @@ void put_table(FILE * f, const int * table, char * * prefixes, int n_states, int } static -void put_state_table(FILE * f, int * states, int n) { - fprintf(f, "int state_table[%d] = {\n", n); - for (int i = 0; i < n; i++) { - if (states[i] == -1) { break; } +void put_state_table(FILE * f, int * states) { + // XXX do i even need this table? + fprintf(f, "int state_table[%d] = {\n", n_states); + for (int i = 0; i < n_states; i++) { + if (states[i] == -1) { break; } // XXX fprintf(f, "\t[%d] = %d,\n", i, states[i]); } fputs("};\n\n", f); + + for (int i = 0; i < n_states; i++) { + fprintf( + f, + "#define %s %d\n", + state_names[i], + states[i] + ); + } + + fputs("\n", f); } static @@ -87,54 +107,37 @@ int get_most_common_prefix(const char * pattern, char * * prefixes, int current_ return r; } -static -int get_max_number_of_states(const rule_t * patterns) { - int r = 0; - int state_max_accumulator = -1; - for (int i = 0; patterns[i].pattern != NULL; i++) { - r += strlen(patterns[i].pattern); - if (patterns[i].state > state_max_accumulator) { - state_max_accumulator = patterns[i].state; - ++r; - } - } - - return r; -} - static void make_and_put_table(FILE * f) { // Init - n_states = get_max_number_of_states(patterns); - int states[n_states]; INITIALIZE_ARRAY(states, n_states, -1); states[0] = 0; - char * prefixes[n_states]; - INITIALIZE_ARRAY(prefixes, n_states, NULL); + char * prefixes[n_rules]; + INITIALIZE_ARRAY(prefixes, n_rules, NULL); - int table[n_states][alphabet_size]; - INITIALIZE_MATRIX(table, n_states, alphabet_size, TOKEN_OFFSET); + int table[n_rules][alphabet_size]; + INITIALIZE_MATRIX(table, n_rules, alphabet_size, TOKEN_OFFSET); // Construct table int next_free_slot = 1; for ( - int pattern_index = 0; - patterns[pattern_index].pattern != NULL; - pattern_index++ + int rule_index = 0; + rules[rule_index].pattern != NULL; + rule_index++ ) { - const rule_t * pattern = &patterns[pattern_index]; + const rule_t * rule = &rules[rule_index]; - int current_state_start = states[pattern->state]; + int current_state_start = states[rule->state]; if (current_state_start == -1) { current_state_start = next_free_slot; - states[pattern->state] = next_free_slot; + states[rule->state] = next_free_slot; ++next_free_slot; } int most_common_prefix_state = get_most_common_prefix( - pattern->pattern, + rule->pattern, prefixes, current_state_start ); @@ -142,30 +145,30 @@ void make_and_put_table(FILE * f) { prefixes[current_state_start] = strdup(""); int most_common_prefix_index = strlen(prefixes[most_common_prefix_state]); - const char * last_char = pattern->pattern + most_common_prefix_index; + const char * last_char = rule->pattern + most_common_prefix_index; table [most_common_prefix_state] - [AS_SYMBOL(pattern->pattern[most_common_prefix_index])] + [AS_SYMBOL(rule->pattern[most_common_prefix_index])] = next_free_slot ; for ( int i = most_common_prefix_index+1; - pattern->pattern[i] != '\0'; + rule->pattern[i] != '\0'; i++, next_free_slot++ ) { table [next_free_slot] - [AS_SYMBOL(pattern->pattern[i])] + [AS_SYMBOL(rule->pattern[i])] = next_free_slot + 1 ; - prefixes[next_free_slot] = strndup(pattern->pattern, i); - last_char = pattern->pattern + i; + prefixes[next_free_slot] = strndup(rule->pattern, i); + last_char = rule->pattern + i; } - int last_position = (last_char == pattern->pattern - || most_common_prefix_index == last_char - pattern->pattern) + int last_position = (last_char == rule->pattern + || most_common_prefix_index == last_char - rule->pattern) ? most_common_prefix_state : next_free_slot-1 ; @@ -173,21 +176,16 @@ void make_and_put_table(FILE * f) { table [last_position] [AS_SYMBOL(*last_char)] - = TOKEN_OFFSET+1 + pattern_index + = TOKEN_OFFSET+1 + rule_index ; - put_table(stderr, (int*)table, prefixes, n_states, alphabet_size); + put_table(stderr, (int*)table, prefixes, n_rules, alphabet_size); fputs("/* ================== */\n", stderr); } - /* `get_max_number_of_states()` most likely over estimated, - * so we cut back the table to the number of rows that were actually used. - */ - n_states = next_free_slot; - // Output - put_table(f, (int*)table, prefixes, n_states, alphabet_size); - put_state_table(f, states, n_states); + put_table(f, (int*)table, prefixes, n_rules, alphabet_size); + put_state_table(f, states); } static @@ -195,16 +193,29 @@ void put_functions(FILE * f) { fputs(yy_lookup_str, f); fputs(yy_lex_str_start, f); - for (rule_t * rule = patterns; rule->code != NULL; rule++) { - fprintf(f, "\tcase %ld: {\n" "%s\n" "\t} break;\n", rule - patterns, rule->code); + for (rule_t * rule = rules; rule->code != NULL; rule++) { + fprintf(f, "\tcase %ld: {\n" "%s\n" "\t} break;\n", rule - rules, rule->code); } fputs(yy_lex_str_end, f); } +void deinit_jeger(void) { + for (int i = 0; i < n_states; i++) { + free(state_names[i]); + } + for (int i = 0; i < n_rules; i++) { + free(rules[i].pattern); + free(rules[i].code); + } + + n_rules = 0; + n_states = 0; +} + void generate(const char * filename) { FILE * f = fopen(filename, "w"); - put_header(f, alphabet_size, n_states, TOKEN_OFFSET); + put_header(f, alphabet_size, TOKEN_OFFSET); make_and_put_table(f); fputs(definition_section_code_buffer, f); diff --git a/source/jeger.h b/source/jeger.h index 3b88800..95e6ec9 100644 --- a/source/jeger.h +++ b/source/jeger.h @@ -1,18 +1,37 @@ #ifndef JEGER_H #define JEGER_H +#include + +// Structs typedef struct { int state; char * pattern; char * code; } rule_t; -extern rule_t * patterns; +typedef enum { + STATIC_TABLE, + SWITCH_TABLE, +} table_t; + +// Globals +extern rule_t * rules; +extern int n_rules; +extern char * * state_names; +extern int n_states; extern int alphabet_size; +extern table_t table_type; + +extern char * prefix; +extern bool do_setup_lineno; + extern char * definition_section_code_buffer; extern char * code_section_code_buffer; +// Functions extern void generate(const char * filename); +extern void deinit_jeger(void); #endif diff --git a/source/jeger.l b/source/jeger.l index 0850adb..4f83952 100644 --- a/source/jeger.l +++ b/source/jeger.l @@ -2,9 +2,14 @@ /* NOTE: its technically very bad taste to implement a minimalist lex subset with flex. + it was a devtime optimization. maybe it should be reimplemented in pure C when possible. */ + + extern "C" { #include "jeger.h" + } + #include #include #include @@ -13,13 +18,22 @@ #include #include + char * prefix = strdup("yy"); + table_t table_type = STATIC_TABLE; + bool do_setup_lineno = false; + using namespace std; - void set_alphanet_range(char s, char e) { + typedef struct { + char * pattern; + char * code; + } rule_t2; + + static void set_alphanet_range(char s, char e) { // XXX not implemented } - void yyerror(const char * fmt, ...) { + static void yyerror(const char * fmt, ...) { va_list va; va_start(va, fmt); @@ -30,29 +44,18 @@ va_end(va); } - typedef enum { - STATIC_TABLE, - SWITCH_TABLE, - } table_t; - - typedef struct { - char * pattern; - char * code; - } rule_t2; - string definition_section_code_buffer_str; string code_section_code_buffer_str; - map> rules; - map>::iterator current_state; - string patter_buffer; - string code_buffer; + static map> rules_map; + static map>::iterator current_state; + static string patter_buffer; + static string code_buffer; - char * prefix = strdup("yy"); - table_t table_type = STATIC_TABLE; - bool do_setup_lineno = false; + static int nest_counter = 0; - int nest_counter = 0; + static int source_state; + static string * source_buffer; %} %x IN_DEFINITION_SECTION IN_RULE_SECTION IN_CODE_SECTION %x IN_DEFINITION_SECTION_CODE @@ -68,6 +71,7 @@ value \"[-a-z]+\" %option yylineno %option nodefault %option noyywrap +%option nounput %% BEGIN IN_DEFINITION_SECTION; @@ -76,10 +80,6 @@ value \"[-a-z]+\" BEGIN IN_RULE_SECTION; } ^\%\{ { - if (definition_section_code_buffer_str != "") { - return 1; - } - BEGIN IN_DEFINITION_SECTION_CODE; } \%x { @@ -88,6 +88,12 @@ value \"[-a-z]+\" \%option { BEGIN IN_OPTION_LIST; } +\/\* { + definition_section_code_buffer_str += yytext; + source_state = IN_DEFINITION_SECTION; + source_buffer = &definition_section_code_buffer_str; + BEGIN IN_MULTILINE_COMMENT; + } . { yyerror("baaaa"); } @@ -96,7 +102,7 @@ value \"[-a-z]+\" { {rule_name} { - rules[yytext] = {}; + rules_map[yytext] = {}; } {ws}* { ; } \n { @@ -157,8 +163,8 @@ prefix={value} { } \<{rule_name}\>\{ { string state_name(yytext+1, yyleng-3); - current_state = rules.find(state_name); - if (current_state == rules.end()) { + current_state = rules_map.find(state_name); + if (current_state == rules_map.end()) { yyerror("State '%s' was never declared.", state_name.c_str()); } @@ -168,15 +174,21 @@ prefix={value} { BEGIN IN_STATE_DEFINITION; } . { - yyerror("baaa"); + yyerror("Rule section giberish (temp warning)."); } \n { ; } } { +\} { + BEGIN IN_RULE_SECTION; + } . { patter_buffer += yytext; } +\\. { + patter_buffer += yytext + 1; + } {wsnl}+\{ { BEGIN IN_CODE; nest_counter = 0; @@ -186,6 +198,7 @@ prefix={value} { { \{ { + code_buffer += yytext; ++nest_counter; } \} { @@ -196,16 +209,26 @@ prefix={value} { .code = strdup(code_buffer.c_str()), }); - BEGIN IN_RULE_SECTION; + patter_buffer = ""; + code_buffer = ""; + + BEGIN IN_STATE_DEFINITION; + } else { + code_buffer += yytext; } } \" { + code_buffer += yytext; BEGIN IN_STRING; } \/\/ { + code_buffer += yytext; BEGIN IN_COMMENT; } \/\* { + code_buffer += yytext; + source_state = IN_CODE; + source_buffer = &code_buffer; BEGIN IN_MULTILINE_COMMENT; } .|\n { @@ -214,20 +237,35 @@ prefix={value} { } { -\\\\ { ; } -\\\" { ; } -\" { BEGIN IN_CODE; } -.|\n { ; } /* XXX we are eating strings */ +\" { + code_buffer += yytext; + BEGIN IN_CODE; + } +\\\\ | +\\\" | +.|\n { + code_buffer += yytext; + } } { -. { ; } -\n { BEGIN IN_CODE; } +. { + code_buffer += yytext; + } +\n { + code_buffer += yytext; + BEGIN IN_CODE; + } } { -.|\n { ; } -\*\/ { BEGIN IN_CODE; } +.|\n { + *source_buffer += yytext; + } +\*\/ { + *source_buffer += yytext; + BEGIN source_state; + } } { @@ -238,12 +276,13 @@ prefix={value} { %% +#if DEBUG == 1 static void dump_parse_results(void) { puts(definition_section_code_buffer_str.c_str()); puts("----------"); - for (const auto &i : rules) { + for (const auto &i : rules_map) { printf("%s:\n", i.first.c_str()); for (const auto &h : i.second) { printf("\tpattern:\n%s\n" "\tcode:\n%s\n", h.pattern, h.code); @@ -255,35 +294,75 @@ void dump_parse_results(void) { puts(code_section_code_buffer_str.c_str()); } +static +void dump_rules(void) { + for (rule_t * rule = rules; rule->pattern != NULL; rule++) { + printf("{ .state = %d, .pattern = %s, }\n", + rule->state, + rule->pattern + ); + } + puts("{ .state = 0, .pattern = NULL, }"); +} +#else +static inline void dump_parse_results(void) { ; } +static inline void dump_rules(void) { ; } +#endif + extern "C" int parse(const char * filename) { + // Init int r = 0; FILE * f = fopen(filename, "r"); if (!f) { return 2; } yyin = f; + // Parse r = yylex(); if (r) { return r; } - dump_parse_results(); + // Set up globals + n_rules = 0; + for (const auto &rule_it : rules_map) { + n_rules += rule_it.second.size(); + } - patterns = (rule_t*)malloc(sizeof(rule_t)*(rules.size()+1)); + rules = (rule_t*)malloc(sizeof(rule_t)*(n_rules+1)); + rules[n_rules] = (rule_t) { 0, NULL, NULL }; - int i = 0; - for (const auto &rule_it : rules) { + int index = 0; + int state = 0; + for (const auto &rule_it : rules_map) { for (const auto &rule : rule_it.second) { - patterns[i++] = (rule_t) { - .state = i, + rules[index++] = (rule_t) { + .state = state, .pattern = rule.pattern, .code = rule.code, }; } + ++state; + } + + n_states = rules_map.size(); + state_names = (char**)malloc(sizeof(char*) * n_states); + int i = 0; + for (const auto &r : rules_map) { + state_names[i++] = strdup(r.first.c_str()); } - patterns[rules.size()] = (rule_t) { 0, NULL, NULL }; definition_section_code_buffer = strdup(definition_section_code_buffer_str.c_str()); code_section_code_buffer = strdup(code_section_code_buffer_str.c_str()); + // Debug + dump_parse_results(); + dump_rules(); + return r; } + +extern "C" +int deinit_parser(void) { + yylex_destroy(); + return 0; +} diff --git a/source/main.c b/source/main.c index 240b19c..09e3f8e 100644 --- a/source/main.c +++ b/source/main.c @@ -16,5 +16,8 @@ signed main(const int argc, char * argv[]) { parse(argv[1]); generate("jeger.yy.c"); + deinit_parser(); + deinit_jeger(); + return 0; } diff --git a/source/parse.h b/source/parse.h index a9409cb..e7ba41d 100644 --- a/source/parse.h +++ b/source/parse.h @@ -2,5 +2,6 @@ #define PARSE_H extern int parse(const char * filename); +extern int deinit_parser(void); #endif diff --git a/source/snippets.inc b/source/snippets.inc index 3d2e0c8..66bcf83 100644 --- a/source/snippets.inc +++ b/source/snippets.inc @@ -20,7 +20,11 @@ int mlookup(const char * s, int state) {\n\ const char * yy_lex_str_start = "\n\ int yylex(const char * s) {\n\ int state = 0;\n\ - for (const char * ss = s; *ss != '\\0'; ss += (mlen ? mlen : 1)) {\n\ + for (\n\ + const char * ss = s;\n\ + *ss != '\\0';\n\ + ss += ((mlen ? mlen : 1) * direction)\n\ + ) {\n\ int match = mlookup(ss, state_table[state]);\n\ if (match != NO_MATCH) {\n\ \n\ diff --git a/test/brainfuck.l b/test/brainfuck.l index f5bcbe3..88e130b 100644 --- a/test/brainfuck.l +++ b/test/brainfuck.l @@ -1,17 +1,19 @@ /* @BAKE - jeger --debug --trace -o $*.c $@ + #jeger --debug --trace -o $*.c $@ + jeger $@ gcc -o $* $*.c -ggdb @STOP */ %{ + #include char data[30000]; char * data_ptr = data; %} -%x IN_SKIP_FORWARD IN_SKIP_BACKWARD - -%option noyywrap nodefault +%x INITIAL IN_SKIP_FORWARD IN_SKIP_BACKWARD %% + +{ \> { ++data_ptr; } \< { --data_ptr; } \+ { ++(*data_ptr); } @@ -29,7 +31,7 @@ BEGIN IN_SKIP_BACKWARD; } } -.|\n { ; } +} { \] { BEGIN INITIAL; } @@ -39,7 +41,7 @@ \[ { REVERSE; BEGIN INITIAL; } } -{ +{ .|\n { ; } } %% @@ -50,13 +52,19 @@ signed main(int argc, char * argv[]) { return 1; } - yyin = fopen(argv[1], "r"); - if (!yyin) { - return 2; - } + FILE * yyin = fopen(argv[1], "r"); + if (!yyin) { return 2; } - yylex(); - yylex_destroy(); + fseek(yyin, 0, SEEK_END); + int yylen = ftell(yyin); + rewind(yyin); + char yystr[yylen+1]; + yystr[yylen] = '\00'; + fread(yystr, yylen, sizeof(char), yyin); + + yylex(yystr); + + fclose(yyin); return 0; }