initially renamed the wrong file
This commit is contained in:
@ -1,123 +1,282 @@
|
||||
// @BAKE gcc -o $*.out $@ -ggdb
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
int had_seperation = 1;
|
||||
#include "jeger.h"
|
||||
|
||||
#include "generated.h"
|
||||
//#define AS_SYMBOL(c) (c-'a')
|
||||
#define AS_SYMBOL(c) c
|
||||
#define TOKEN_OFFSET 128 /* XXX */
|
||||
|
||||
int mlen;
|
||||
/* One literal rule of the lexer description.
 * Rule lists are terminated by an entry whose `pattern` is NULL.
 */
typedef struct {
	int state;             /* start state the rule belongs to; used as an index into the state array */
	const char * pattern;  /* NUL-terminated literal to recognise; NULL marks the end of the list */
} pattern_t;
|
||||
|
||||
static inline
|
||||
int mlookup(const char * s, int state) {
|
||||
for (int i = 0; s[i] != '\0'; i++) {
|
||||
state = table[state][AS_SYMBOL(s[i])];
|
||||
if (state == NO_MATCH) {
|
||||
break;
|
||||
} else
|
||||
if (state > NO_MATCH) {
|
||||
mlen = i+1;
|
||||
return state;
|
||||
/* Writes the preprocessor prelude of the generated lexer to `f`:
 * the AS_SYMBOL mapping, followed by ALPHABET_SIZE, N_STATES and NO_MATCH
 * set to the given values, followed by one empty line.
 */
void put_header(FILE * f, const int alphabet_size, const int n_states, const int no_match) {
	fputs("#define AS_SYMBOL(c) c\n", f);  // alternative mapping: (c-'a')
	fprintf(f, "#define ALPHABET_SIZE %d\n", alphabet_size);
	fprintf(f, "#define N_STATES %d\n", n_states);
	fprintf(f, "#define NO_MATCH %d\n", no_match);

	fputs("\n", f);
}
|
||||
|
||||
/* Writes `text` so that it can safely sit inside a C block comment:
 * a '/' that directly follows a '*' gets a space in front of it, so the
 * sequence that would close the comment is never emitted.
 * A NULL `text` is written as "(null)" instead of being passed to "%s".
 */
static inline
void put_comment_text(FILE * f, const char * text) {
	if (text == NULL) {
		fputs("(null)", f);
		return;
	}
	for (const char * c = text; *c != '\0'; c++) {
		if (*c == '/' && c != text && c[-1] == '*') {
			fputc(' ', f);
		}
		fputc(*c, f);
	}
}

/* Writes the transition table as a C definition of `int table[N_STATES][ALPHABET_SIZE]`.
 *
 *  f             - stream to write to
 *  table         - n_states * alphabet_size cells, row-major (row = state, column = symbol)
 *  prefixes      - one string per state, echoed in a trailing comment on the row; entries may be NULL
 *  n_states      - number of rows to emit
 *  alphabet_size - number of columns per row
 *
 * Each cell is emitted as a designated initializer. Printable columns use a character
 * literal as the designator (backslash and single quote are escaped), all others use
 * the numeric column index.
 */
static inline
void put_table(FILE * f, const int * table, char * * prefixes, int n_states, int alphabet_size) {
	fputs("int table[N_STATES][ALPHABET_SIZE] = {\n", f);
	for (int i = 0; i < n_states; i++) {
		const int * row = table + i*alphabet_size;
		fprintf(f, "\t[%d] = {", i);
		for (int h = 0; h < alphabet_size; h++) {
			if (h == '\\') {
				fprintf(f, "['\\\\'] = %d, ", row[h]);
			} else if (h == '\'') {
				fprintf(f, "['\\''] = %d, ", row[h]);
			} else if (h == (unsigned char)h && isprint(h)) {
				// isprint() is only defined for values representable as unsigned char (or EOF)
				fprintf(f, "['%c'] = %d, ", h, row[h]);
			} else {
				fprintf(f, "[%d] = %d, ", h, row[h]);
			}
		}
		fputs("}, /* \"", f);
		put_comment_text(f, prefixes[i]);
		fputs("\" */\n", f);
	}
	fputs("};\n", f);
}
|
||||
|
||||
/* Prints `int state_table[]` to stdout, one designated initializer per entry of `states`.
 * Output stops after `n` entries or at the first entry equal to -1, whichever comes first.
 */
void put_state_table(int * states, int n) {
	puts("int state_table[] = {");
	for (int i = 0; i < n && states[i] != -1; i++) {
		printf("\t[%d] = %d,\n", i, states[i]);
	}
	puts("};");
}
|
||||
|
||||
/* Scans `prefixes` from index `current_state_start` up to (not including) the first NULL
 * entry and returns the index of the LAST entry that `pattern` starts with.
 * Returns `current_state_start` itself when no scanned entry matches, which includes the
 * case where `prefixes[current_state_start]` is NULL.
 *
 * NOTE(review): the last match wins, not the longest one; this presumably relies on the
 * order in which the caller fills `prefixes` - confirm before reordering rules.
 */
int get_most_common_prefix(const char * pattern, char * * prefixes, int current_state_start) {
	int best = current_state_start;
	for (int i = current_state_start; prefixes[i] != NULL; i++) {
		const char * candidate = prefixes[i];
		if (strncmp(pattern, candidate, strlen(candidate)) == 0) {
			best = i;
		}
	}
	return best;
}
|
||||
|
||||
int get_max_number_of_states(const pattern_t * patterns) {
|
||||
int r = 0;
|
||||
int state_max_accumulator = -1;
|
||||
for (int i = 0; patterns[i].pattern != NULL; i++) {
|
||||
r += strlen(patterns[i].pattern);
|
||||
if (patterns[i].state > state_max_accumulator) {
|
||||
state_max_accumulator = patterns[i].state;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
|
||||
mlen = 0;
|
||||
return NO_MATCH;
|
||||
return r;
|
||||
}
|
||||
|
||||
#define N_KEYWORDS 34
|
||||
#define N_SEPARATORS 10
|
||||
#define RETARDATION_OFFSET (NO_MATCH+N_KEYWORDS+N_SEPARATORS)
|
||||
void generate(const pattern_t * patterns) {
|
||||
// Init
|
||||
int n_states = get_max_number_of_states(patterns);
|
||||
|
||||
#if 0
|
||||
# define TRACE fprintf(stderr, "--accepting rule at line %d (\"%.*s\")\n", __LINE__, mlen, ss);
|
||||
# define TRACE_DEFAULT fprintf(stderr, "--accepting default rule (\"%c\")\n", *ss);
|
||||
#else
|
||||
# define TRACE
|
||||
# define TRACE_DEFAULT
|
||||
#endif
|
||||
int states[n_states];
|
||||
INITIALIZE_ARRAY(states, n_states, -1);
|
||||
states[0] = 0;
|
||||
|
||||
int mlex(const char * s) {
|
||||
int state = 0;
|
||||
for (const char * ss = s; *ss != '\0'; ss += (mlen ? mlen : 1)) {
|
||||
int match = mlookup(ss, state_table[state]);
|
||||
if (match != NO_MATCH) {
|
||||
|
||||
} else {
|
||||
|
||||
char * prefixes[n_states];
|
||||
INITIALIZE_ARRAY(prefixes, n_states, NULL);
|
||||
|
||||
int table[n_states][alphabet_size];
|
||||
INITIALIZE_MATRIX(table, n_states, alphabet_size, TOKEN_OFFSET);
|
||||
|
||||
// Construct table
|
||||
int next_free_slot = 1;
|
||||
for (
|
||||
int pattern_index = 0;
|
||||
patterns[pattern_index].pattern != NULL;
|
||||
pattern_index++
|
||||
) {
|
||||
const pattern_t * pattern = &patterns[pattern_index];
|
||||
|
||||
int current_state_start = states[pattern->state];
|
||||
if (current_state_start == -1) {
|
||||
current_state_start = next_free_slot;
|
||||
states[pattern->state] = next_free_slot;
|
||||
++next_free_slot;
|
||||
}
|
||||
switch (match) {
|
||||
case NO_MATCH: {
|
||||
TRACE_DEFAULT;
|
||||
putchar(*ss);
|
||||
had_seperation = 0;
|
||||
} break;
|
||||
// keyword
|
||||
case NO_MATCH+1 ... NO_MATCH+N_KEYWORDS: {
|
||||
TRACE;
|
||||
if (had_seperation) {
|
||||
printf("\033[31m%.*s\033[0m", mlen, ss);
|
||||
} else {
|
||||
printf("%.*s", mlen, ss);
|
||||
|
||||
int most_common_prefix_state = get_most_common_prefix(
|
||||
pattern->pattern,
|
||||
prefixes,
|
||||
current_state_start
|
||||
);
|
||||
|
||||
prefixes[current_state_start] = strdup("");
|
||||
|
||||
int most_common_prefix_index = strlen(prefixes[most_common_prefix_state]);
|
||||
const char * last_char = pattern->pattern + most_common_prefix_index;
|
||||
|
||||
table
|
||||
[most_common_prefix_state]
|
||||
[AS_SYMBOL(pattern->pattern[most_common_prefix_index])]
|
||||
= next_free_slot
|
||||
;
|
||||
|
||||
for (
|
||||
int i = most_common_prefix_index+1;
|
||||
pattern->pattern[i] != '\0';
|
||||
i++, next_free_slot++
|
||||
) {
|
||||
table
|
||||
[next_free_slot]
|
||||
[AS_SYMBOL(pattern->pattern[i])]
|
||||
= next_free_slot + 1
|
||||
;
|
||||
prefixes[next_free_slot] = strndup(pattern->pattern, i);
|
||||
last_char = pattern->pattern + i;
|
||||
}
|
||||
|
||||
int last_position = (last_char == pattern->pattern
|
||||
|| most_common_prefix_index == last_char - pattern->pattern)
|
||||
? most_common_prefix_state
|
||||
: next_free_slot-1
|
||||
;
|
||||
|
||||
table
|
||||
[last_position]
|
||||
[AS_SYMBOL(*last_char)]
|
||||
= TOKEN_OFFSET+1 + pattern_index
|
||||
;
|
||||
|
||||
put_table(stderr, (int*)table, prefixes, n_states, alphabet_size);
|
||||
fputs("/* ================== */\n", stderr);
|
||||
}
|
||||
|
||||
/* `get_max_number_of_states()` most likely over estimated,
|
||||
* so we cut back the table to the number of rows that were actually used.
|
||||
*/
|
||||
n_states = next_free_slot;
|
||||
|
||||
// Output
|
||||
put_header(stdout, alphabet_size, n_states, TOKEN_OFFSET);
|
||||
put_table(stdout, (int*)table, prefixes, n_states, alphabet_size);
|
||||
put_state_table(states, n_states);
|
||||
}
|
||||
|
||||
signed main(void) {
|
||||
|
||||
generate(patterns);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//pattern_t patterns[] = {
|
||||
// {0, "while"},
|
||||
// {0, "printf"},
|
||||
// {0, "\""},
|
||||
// {1, "."},
|
||||
// {1, "\""},
|
||||
// {0, NULL}
|
||||
//};
|
||||
|
||||
//pattern_t patterns[] = {
|
||||
// {0, "auto"},
|
||||
// {0, "break"},
|
||||
// {0, "case"},
|
||||
// {0, "char"},
|
||||
// {0, "const"},
|
||||
// {0, "continue"},
|
||||
// {0, "default"},
|
||||
// {0, "do"},
|
||||
// {0, "double"},
|
||||
// {0, "else"},
|
||||
// {0, "enum"},
|
||||
// {0, "extern"},
|
||||
// {0, "float"},
|
||||
// {0, "for"},
|
||||
// {0, "goto"},
|
||||
// {0, "if"},
|
||||
// {0, "inline"},
|
||||
// {0, "int"},
|
||||
// {0, "long"},
|
||||
// {0, "register"},
|
||||
// {0, "return"},
|
||||
// {0, "restrict"},
|
||||
// {0, "short"},
|
||||
// {0, "signed"},
|
||||
// {0, "sizeof"},
|
||||
// {0, "static"},
|
||||
// {0, "struct"},
|
||||
// {0, "switch"},
|
||||
// {0, "typedef"},
|
||||
// {0, "union"},
|
||||
// {0, "unsigned"},
|
||||
// {0, "void"},
|
||||
// {0, "volatile"},
|
||||
// {0, "while"},
|
||||
// {0, " "},
|
||||
// {0, "\n"},
|
||||
// {0, "("},
|
||||
// {0, ")"},
|
||||
// {0, "{"},
|
||||
// {0, "}"},
|
||||
// {0, "["},
|
||||
// {0, "]"},
|
||||
// {0, ","},
|
||||
// {0, ";"},
|
||||
// {0, "\""},
|
||||
// {0, "/*"},
|
||||
// {0, "//"},
|
||||
// {1, "\\\""},
|
||||
// {1, "\""},
|
||||
// {2, "*/"},
|
||||
// {3, "\n"},
|
||||
// {0, NULL}
|
||||
//};
|
||||
|
||||
//pattern_t patterns[] = {
|
||||
// {0, "short"},
|
||||
// {0, "signed"},
|
||||
// {0, "sizeof"},
|
||||
// {0, "static"},
|
||||
// {0, "struct"},
|
||||
// {0, "switch"},
|
||||
// {0, NULL}
|
||||
//};
|
||||
|
||||
/*
|
||||
if (pattern->pattern[i] == '.') {
|
||||
for (int col = 0; col < ALPHABET_SIZE; col++) {
|
||||
table
|
||||
[next_free_slot]
|
||||
[col]
|
||||
= next_free_slot + 1
|
||||
;
|
||||
}
|
||||
had_seperation = 0;
|
||||
} break;
|
||||
// Sep
|
||||
case NO_MATCH+N_KEYWORDS+1 ... RETARDATION_OFFSET: {
|
||||
TRACE;
|
||||
printf("\033[35m%c\033[0m", *ss);
|
||||
//putchar(*ss);
|
||||
had_seperation = 1;
|
||||
} break;
|
||||
// string
|
||||
case RETARDATION_OFFSET+1: {
|
||||
TRACE;
|
||||
state = 1;
|
||||
printf("\033[32m\"");
|
||||
} break;
|
||||
case RETARDATION_OFFSET+5: {
|
||||
TRACE;
|
||||
state = 0;
|
||||
printf("\"\033[0m");
|
||||
} break;
|
||||
// comment (multiline)
|
||||
case RETARDATION_OFFSET+2: {
|
||||
TRACE;
|
||||
state = 2;
|
||||
printf("\033[34m/*");
|
||||
} break;
|
||||
case RETARDATION_OFFSET+6: {
|
||||
TRACE;
|
||||
state = 0;
|
||||
printf("*/\033[0m");
|
||||
had_seperation = 1;
|
||||
} break;
|
||||
// comment (single line)
|
||||
case RETARDATION_OFFSET+3: {
|
||||
TRACE;
|
||||
state = 3;
|
||||
printf("\033[34m//");
|
||||
} break;
|
||||
case RETARDATION_OFFSET+7: {
|
||||
TRACE;
|
||||
state = 0;
|
||||
printf("\033[0m\n");
|
||||
had_seperation = 1;
|
||||
} break;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef MAIN_GENERATED_USER_MAIN
|
||||
|
||||
extern const char * source_code;
|
||||
#include "c_source_code_str.inc"
|
||||
|
||||
signed main(void) {
|
||||
//mlex("while (1) { printf(\"Heyo\"); }\n");
|
||||
mlex(source_code);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
} else {
|
||||
table
|
||||
[next_free_slot]
|
||||
[AS_SYMBOL(pattern->pattern[i])]
|
||||
= next_free_slot + 1
|
||||
;
|
||||
}
|
||||
*/
|
||||
|
Reference in New Issue
Block a user