diff --git a/source/generator.c b/source/generator.c index ae41518..0a8f463 100644 --- a/source/generator.c +++ b/source/generator.c @@ -1,123 +1,282 @@ -// @BAKE gcc -o $*.out $@ -ggdb #include +#include +#include -int had_seperation = 1; +#include "jeger.h" -#include "generated.h" +//#define AS_SYMBOL(c) (c-'a') +#define AS_SYMBOL(c) c +#define TOKEN_OFFSET 128 /* XXX */ -int mlen; +typedef struct { + int state; + const char * pattern; +} pattern_t; static inline -int mlookup(const char * s, int state) { - for (int i = 0; s[i] != '\0'; i++) { - state = table[state][AS_SYMBOL(s[i])]; - if (state == NO_MATCH) { - break; - } else - if (state > NO_MATCH) { - mlen = i+1; - return state; +void put_header(FILE * f, const int alphabet_size, const int n_states, const int no_match) { + fputs( + "#define AS_SYMBOL(c) c\n", // (c-'a')\n + f + ); + fprintf( + f, + "#define ALPHABET_SIZE %d\n", + alphabet_size + ); + fprintf( + f, + "#define N_STATES %d\n", + n_states + ); + fprintf( + f, + "#define NO_MATCH %d\n", + no_match + ); + + fputs("\n", f); +} + +static inline +void put_table(FILE * f, const int * table, char * * prefixes, int n_states, int alphabet_size) { + fputs("int table[N_STATES][ALPHABET_SIZE] = {\n", f); + for (int i = 0; i < n_states; i++) { + fprintf(f, "\t[%d] = {", i); + for (int h = 0; h < alphabet_size; h++) { + if (h == '\\') { + fprintf(f, "['\\\\'] = %d, ", table[i*alphabet_size + h]); + } else + if (h == '\'') { + fprintf(f, "['\\''] = %d, ", table[i*alphabet_size + h]); + } else + if (isprint(h)) { + fprintf(f, "['%c'] = %d, ", h, table[i*alphabet_size + h]); + } else { + fprintf(f, "[%d] = %d, ", h, table[i*alphabet_size + h]); + } + } + fprintf(f, "}, /* \"%s\" */\n", prefixes[i]); // XXX can break + } + fputs("};\n", f); +} + +void put_state_table(int * states, int n) { + puts("int state_table[] = {"); + for (int i = 0; i < n; i++) { + if (states[i] == -1) { break; } + printf("\t[%d] = %d,\n", i, states[i]); + } + puts("};"); +} + +int get_most_common_prefix(const char * pattern, char * * prefixes, int current_state_start) { + int r = current_state_start; + for (int i = current_state_start; prefixes[i] != NULL; i++) { + if (!strncmp(pattern, prefixes[i], strlen(prefixes[i]))) { + r = i; + } + } + return r; +} + +int get_max_number_of_states(const pattern_t * patterns) { + int r = 0; + int state_max_accumulator = -1; + for (int i = 0; patterns[i].pattern != NULL; i++) { + r += strlen(patterns[i].pattern); + if (patterns[i].state > state_max_accumulator) { + state_max_accumulator = patterns[i].state; + ++r; } } - mlen = 0; - return NO_MATCH; + return r; } -#define N_KEYWORDS 34 -#define N_SEPARATORS 10 -#define RETARDATION_OFFSET (NO_MATCH+N_KEYWORDS+N_SEPARATORS) +void generate(const pattern_t * patterns) { + // Init + int n_states = get_max_number_of_states(patterns); -#if 0 -# define TRACE fprintf(stderr, "--accepting rule at line %d (\"%.*s\")\n", __LINE__, mlen, ss); -# define TRACE_DEFAULT fprintf(stderr, "--accepting default rule (\"%c\")\n", *ss); -#else -# define TRACE -# define TRACE_DEFAULT -#endif + int states[n_states]; + INITIALIZE_ARRAY(states, n_states, -1); + states[0] = 0; -int mlex(const char * s) { - int state = 0; - for (const char * ss = s; *ss != '\0'; ss += (mlen ? mlen : 1)) { - int match = mlookup(ss, state_table[state]); - if (match != NO_MATCH) { - - } else { - + char * prefixes[n_states]; + INITIALIZE_ARRAY(prefixes, n_states, NULL); + + int table[n_states][alphabet_size]; + INITIALIZE_MATRIX(table, n_states, alphabet_size, TOKEN_OFFSET); + + // Construct table + int next_free_slot = 1; + for ( + int pattern_index = 0; + patterns[pattern_index].pattern != NULL; + pattern_index++ + ) { + const pattern_t * pattern = &patterns[pattern_index]; + + int current_state_start = states[pattern->state]; + if (current_state_start == -1) { + current_state_start = next_free_slot; + states[pattern->state] = next_free_slot; + ++next_free_slot; } - switch (match) { - case NO_MATCH: { - TRACE_DEFAULT; - putchar(*ss); - had_seperation = 0; - } break; - // keyword - case NO_MATCH+1 ... NO_MATCH+N_KEYWORDS: { - TRACE; - if (had_seperation) { - printf("\033[31m%.*s\033[0m", mlen, ss); - } else { - printf("%.*s", mlen, ss); + + int most_common_prefix_state = get_most_common_prefix( + pattern->pattern, + prefixes, + current_state_start + ); + + prefixes[current_state_start] = strdup(""); + + int most_common_prefix_index = strlen(prefixes[most_common_prefix_state]); + const char * last_char = pattern->pattern + most_common_prefix_index; + + table + [most_common_prefix_state] + [AS_SYMBOL(pattern->pattern[most_common_prefix_index])] + = next_free_slot + ; + + for ( + int i = most_common_prefix_index+1; + pattern->pattern[i] != '\0'; + i++, next_free_slot++ + ) { + table + [next_free_slot] + [AS_SYMBOL(pattern->pattern[i])] + = next_free_slot + 1 + ; + prefixes[next_free_slot] = strndup(pattern->pattern, i); + last_char = pattern->pattern + i; + } + + int last_position = (last_char == pattern->pattern + || most_common_prefix_index == last_char - pattern->pattern) + ? most_common_prefix_state + : next_free_slot-1 + ; + + table + [last_position] + [AS_SYMBOL(*last_char)] + = TOKEN_OFFSET+1 + pattern_index + ; + + put_table(stderr, (int*)table, prefixes, n_states, alphabet_size); + fputs("/* ================== */\n", stderr); + } + + /* `get_max_number_of_states()` most likely over estimated, + * so we cut back the table to the number of rows that were actually used. + */ + n_states = next_free_slot; + + // Output + put_header(stdout, alphabet_size, n_states, TOKEN_OFFSET); + put_table(stdout, (int*)table, prefixes, n_states, alphabet_size); + put_state_table(states, n_states); +} + +signed main(void) { + + generate(patterns); + + return 0; +} + +//pattern_t patterns[] = { +// {0, "while"}, +// {0, "printf"}, +// {0, "\""}, +// {1, "."}, +// {1, "\""}, +// {0, NULL} +//}; + +//pattern_t patterns[] = { +// {0, "auto"}, +// {0, "break"}, +// {0, "case"}, +// {0, "char"}, +// {0, "const"}, +// {0, "continue"}, +// {0, "default"}, +// {0, "do"}, +// {0, "double"}, +// {0, "else"}, +// {0, "enum"}, +// {0, "extern"}, +// {0, "float"}, +// {0, "for"}, +// {0, "goto"}, +// {0, "if"}, +// {0, "inline"}, +// {0, "int"}, +// {0, "long"}, +// {0, "register"}, +// {0, "return"}, +// {0, "restrict"}, +// {0, "short"}, +// {0, "signed"}, +// {0, "sizeof"}, +// {0, "static"}, +// {0, "struct"}, +// {0, "switch"}, +// {0, "typedef"}, +// {0, "union"}, +// {0, "unsigned"}, +// {0, "void"}, +// {0, "volatile"}, +// {0, "while"}, +// {0, " "}, +// {0, "\n"}, +// {0, "("}, +// {0, ")"}, +// {0, "{"}, +// {0, "}"}, +// {0, "["}, +// {0, "]"}, +// {0, ","}, +// {0, ";"}, +// {0, "\""}, +// {0, "/*"}, +// {0, "//"}, +// {1, "\\\""}, +// {1, "\""}, +// {2, "*/"}, +// {3, "\n"}, +// {0, NULL} +//}; + +//pattern_t patterns[] = { +// {0, "short"}, +// {0, "signed"}, +// {0, "sizeof"}, +// {0, "static"}, +// {0, "struct"}, +// {0, "switch"}, +// {0, NULL} +//}; + +/* + if (pattern->pattern[i] == '.') { + for (int col = 0; col < ALPHABET_SIZE; col++) { + table + [next_free_slot] + [col] + = next_free_slot + 1 + ; } - had_seperation = 0; - } break; - // Sep - case NO_MATCH+N_KEYWORDS+1 ... RETARDATION_OFFSET: { - TRACE; - printf("\033[35m%c\033[0m", *ss); - //putchar(*ss); - had_seperation = 1; - } break; - // string - case RETARDATION_OFFSET+1: { - TRACE; - state = 1; - printf("\033[32m\""); - } break; - case RETARDATION_OFFSET+5: { - TRACE; - state = 0; - printf("\"\033[0m"); - } break; - // comment (multiline) - case RETARDATION_OFFSET+2: { - TRACE; - state = 2; - printf("\033[34m/*"); - } break; - case RETARDATION_OFFSET+6: { - TRACE; - state = 0; - printf("*/\033[0m"); - had_seperation = 1; - } break; - // comment (single line) - case RETARDATION_OFFSET+3: { - TRACE; - state = 3; - printf("\033[34m//"); - } break; - case RETARDATION_OFFSET+7: { - TRACE; - state = 0; - printf("\033[0m\n"); - had_seperation = 1; - } break; - } - } - return 0; -} - -#ifdef MAIN_GENERATED_USER_MAIN - -extern const char * source_code; -#include "c_source_code_str.inc" - -signed main(void) { - //mlex("while (1) { printf(\"Heyo\"); }\n"); - mlex(source_code); - - return 0; -} - -#endif + } else { + table + [next_free_slot] + [AS_SYMBOL(pattern->pattern[i])] + = next_free_slot + 1 + ; + } +*/ diff --git a/source/legacy_generator.c b/source/legacy_generator.c new file mode 100644 index 0000000..ae41518 --- /dev/null +++ b/source/legacy_generator.c @@ -0,0 +1,123 @@ +// @BAKE gcc -o $*.out $@ -ggdb +#include + +int had_seperation = 1; + +#include "generated.h" + +int mlen; + +static inline +int mlookup(const char * s, int state) { + for (int i = 0; s[i] != '\0'; i++) { + state = table[state][AS_SYMBOL(s[i])]; + if (state == NO_MATCH) { + break; + } else + if (state > NO_MATCH) { + mlen = i+1; + return state; + } + } + + mlen = 0; + return NO_MATCH; +} + +#define N_KEYWORDS 34 +#define N_SEPARATORS 10 +#define RETARDATION_OFFSET (NO_MATCH+N_KEYWORDS+N_SEPARATORS) + +#if 0 +# define TRACE fprintf(stderr, "--accepting rule at line %d (\"%.*s\")\n", __LINE__, mlen, ss); +# define TRACE_DEFAULT fprintf(stderr, "--accepting default rule (\"%c\")\n", *ss); +#else +# define TRACE +# define TRACE_DEFAULT +#endif + +int mlex(const char * s) { + int state = 0; + for (const char * ss = s; *ss != '\0'; ss += (mlen ? mlen : 1)) { + int match = mlookup(ss, state_table[state]); + if (match != NO_MATCH) { + + } else { + + } + switch (match) { + case NO_MATCH: { + TRACE_DEFAULT; + putchar(*ss); + had_seperation = 0; + } break; + // keyword + case NO_MATCH+1 ... NO_MATCH+N_KEYWORDS: { + TRACE; + if (had_seperation) { + printf("\033[31m%.*s\033[0m", mlen, ss); + } else { + printf("%.*s", mlen, ss); + } + had_seperation = 0; + } break; + // Sep + case NO_MATCH+N_KEYWORDS+1 ... RETARDATION_OFFSET: { + TRACE; + printf("\033[35m%c\033[0m", *ss); + //putchar(*ss); + had_seperation = 1; + } break; + // string + case RETARDATION_OFFSET+1: { + TRACE; + state = 1; + printf("\033[32m\""); + } break; + case RETARDATION_OFFSET+5: { + TRACE; + state = 0; + printf("\"\033[0m"); + } break; + // comment (multiline) + case RETARDATION_OFFSET+2: { + TRACE; + state = 2; + printf("\033[34m/*"); + } break; + case RETARDATION_OFFSET+6: { + TRACE; + state = 0; + printf("*/\033[0m"); + had_seperation = 1; + } break; + // comment (single line) + case RETARDATION_OFFSET+3: { + TRACE; + state = 3; + printf("\033[34m//"); + } break; + case RETARDATION_OFFSET+7: { + TRACE; + state = 0; + printf("\033[0m\n"); + had_seperation = 1; + } break; + } + } + return 0; +} + +#ifdef MAIN_GENERATED_USER_MAIN + +extern const char * source_code; +#include "c_source_code_str.inc" + +signed main(void) { + //mlex("while (1) { printf(\"Heyo\"); }\n"); + mlex(source_code); + + return 0; +} + +#endif diff --git a/source/main_generator.c b/source/main_generator.c deleted file mode 100644 index 0a8f463..0000000 --- a/source/main_generator.c +++ /dev/null @@ -1,282 +0,0 @@ -#include -#include -#include - -#include "jeger.h" - -//#define AS_SYMBOL(c) (c-'a') -#define AS_SYMBOL(c) c -#define TOKEN_OFFSET 128 /* XXX */ - -typedef struct { - int state; - const char * pattern; -} pattern_t; - -static inline -void put_header(FILE * f, const int alphabet_size, const int n_states, const int no_match) { - fputs( - "#define AS_SYMBOL(c) c\n", // (c-'a')\n - f - ); - fprintf( - f, - "#define ALPHABET_SIZE %d\n", - alphabet_size - ); - fprintf( - f, - "#define N_STATES %d\n", - n_states - ); - fprintf( - f, - "#define NO_MATCH %d\n", - no_match - ); - - fputs("\n", f); -} - -static inline -void put_table(FILE * f, const int * table, char * * prefixes, int n_states, int alphabet_size) { - fputs("int table[N_STATES][ALPHABET_SIZE] = {\n", f); - for (int i = 0; i < n_states; i++) { - fprintf(f, "\t[%d] = {", i); - for (int h = 0; h < alphabet_size; h++) { - if (h == '\\') { - fprintf(f, "['\\\\'] = %d, ", table[i*alphabet_size + h]); - } else - if (h == '\'') { - fprintf(f, "['\\''] = %d, ", table[i*alphabet_size + h]); - } else - if (isprint(h)) { - fprintf(f, "['%c'] = %d, ", h, table[i*alphabet_size + h]); - } else { - fprintf(f, "[%d] = %d, ", h, table[i*alphabet_size + h]); - } - } - fprintf(f, "}, /* \"%s\" */\n", prefixes[i]); // XXX can break - } - fputs("};\n", f); -} - -void put_state_table(int * states, int n) { - puts("int state_table[] = {"); - for (int i = 0; i < n; i++) { - if (states[i] == -1) { break; } - printf("\t[%d] = %d,\n", i, states[i]); - } - puts("};"); -} - -int get_most_common_prefix(const char * pattern, char * * prefixes, int current_state_start) { - int r = current_state_start; - for (int i = current_state_start; prefixes[i] != NULL; i++) { - if (!strncmp(pattern, prefixes[i], strlen(prefixes[i]))) { - r = i; - } - } - return r; -} - -int get_max_number_of_states(const pattern_t * patterns) { - int r = 0; - int state_max_accumulator = -1; - for (int i = 0; patterns[i].pattern != NULL; i++) { - r += strlen(patterns[i].pattern); - if (patterns[i].state > state_max_accumulator) { - state_max_accumulator = patterns[i].state; - ++r; - } - } - - return r; -} - -void generate(const pattern_t * patterns) { - // Init - int n_states = get_max_number_of_states(patterns); - - int states[n_states]; - INITIALIZE_ARRAY(states, n_states, -1); - states[0] = 0; - - char * prefixes[n_states]; - INITIALIZE_ARRAY(prefixes, n_states, NULL); - - int table[n_states][alphabet_size]; - INITIALIZE_MATRIX(table, n_states, alphabet_size, TOKEN_OFFSET); - - // Construct table - int next_free_slot = 1; - for ( - int pattern_index = 0; - patterns[pattern_index].pattern != NULL; - pattern_index++ - ) { - const pattern_t * pattern = &patterns[pattern_index]; - - int current_state_start = states[pattern->state]; - if (current_state_start == -1) { - current_state_start = next_free_slot; - states[pattern->state] = next_free_slot; - ++next_free_slot; - } - - int most_common_prefix_state = get_most_common_prefix( - pattern->pattern, - prefixes, - current_state_start - ); - - prefixes[current_state_start] = strdup(""); - - int most_common_prefix_index = strlen(prefixes[most_common_prefix_state]); - const char * last_char = pattern->pattern + most_common_prefix_index; - - table - [most_common_prefix_state] - [AS_SYMBOL(pattern->pattern[most_common_prefix_index])] - = next_free_slot - ; - - for ( - int i = most_common_prefix_index+1; - pattern->pattern[i] != '\0'; - i++, next_free_slot++ - ) { - table - [next_free_slot] - [AS_SYMBOL(pattern->pattern[i])] - = next_free_slot + 1 - ; - prefixes[next_free_slot] = strndup(pattern->pattern, i); - last_char = pattern->pattern + i; - } - - int last_position = (last_char == pattern->pattern - || most_common_prefix_index == last_char - pattern->pattern) - ? most_common_prefix_state - : next_free_slot-1 - ; - - table - [last_position] - [AS_SYMBOL(*last_char)] - = TOKEN_OFFSET+1 + pattern_index - ; - - put_table(stderr, (int*)table, prefixes, n_states, alphabet_size); - fputs("/* ================== */\n", stderr); - } - - /* `get_max_number_of_states()` most likely over estimated, - * so we cut back the table to the number of rows that were actually used. - */ - n_states = next_free_slot; - - // Output - put_header(stdout, alphabet_size, n_states, TOKEN_OFFSET); - put_table(stdout, (int*)table, prefixes, n_states, alphabet_size); - put_state_table(states, n_states); -} - -signed main(void) { - - generate(patterns); - - return 0; -} - -//pattern_t patterns[] = { -// {0, "while"}, -// {0, "printf"}, -// {0, "\""}, -// {1, "."}, -// {1, "\""}, -// {0, NULL} -//}; - -//pattern_t patterns[] = { -// {0, "auto"}, -// {0, "break"}, -// {0, "case"}, -// {0, "char"}, -// {0, "const"}, -// {0, "continue"}, -// {0, "default"}, -// {0, "do"}, -// {0, "double"}, -// {0, "else"}, -// {0, "enum"}, -// {0, "extern"}, -// {0, "float"}, -// {0, "for"}, -// {0, "goto"}, -// {0, "if"}, -// {0, "inline"}, -// {0, "int"}, -// {0, "long"}, -// {0, "register"}, -// {0, "return"}, -// {0, "restrict"}, -// {0, "short"}, -// {0, "signed"}, -// {0, "sizeof"}, -// {0, "static"}, -// {0, "struct"}, -// {0, "switch"}, -// {0, "typedef"}, -// {0, "union"}, -// {0, "unsigned"}, -// {0, "void"}, -// {0, "volatile"}, -// {0, "while"}, -// {0, " "}, -// {0, "\n"}, -// {0, "("}, -// {0, ")"}, -// {0, "{"}, -// {0, "}"}, -// {0, "["}, -// {0, "]"}, -// {0, ","}, -// {0, ";"}, -// {0, "\""}, -// {0, "/*"}, -// {0, "//"}, -// {1, "\\\""}, -// {1, "\""}, -// {2, "*/"}, -// {3, "\n"}, -// {0, NULL} -//}; - -//pattern_t patterns[] = { -// {0, "short"}, -// {0, "signed"}, -// {0, "sizeof"}, -// {0, "static"}, -// {0, "struct"}, -// {0, "switch"}, -// {0, NULL} -//}; - -/* - if (pattern->pattern[i] == '.') { - for (int col = 0; col < ALPHABET_SIZE; col++) { - table - [next_free_slot] - [col] - = next_free_slot + 1 - ; - } - } else { - table - [next_free_slot] - [AS_SYMBOL(pattern->pattern[i])] - = next_free_slot + 1 - ; - } -*/