diff --git a/source/hl.h b/source/hl.h index 8735177..e88f265 100644 --- a/source/hl.h +++ b/source/hl.h @@ -71,10 +71,11 @@ extern token_t * new_token(const char * const word, // TODO: ALIGN PROPERLY... -extern int token_fits(const token_t * const token, - const char * const to, +extern int token_fits(const token_t * const token, + const char * const to, const int string_offset, - int * match_offset); + const bool is_start_of_line, + int * match_offset); extern void render_string(const char * const string, const char * const mode); @@ -232,6 +233,11 @@ token_t * new_token(const char * const word, return new_keyword_token(word, g); } case MATCH: { + token_t * mt = (token_t*)malloc(sizeof(token_t)); + mt->hl = g; + mt->t = MATCH; + mt->syntax = regex_compile(word); + append_token(mt); } break; case REGION: { } break; @@ -244,14 +250,15 @@ token_t * new_token(const char * const word, // ### Highlighting ### // -------------------- -int token_fits(const token_t * const token, - const char * const to, - const int string_offset, +int token_fits(const token_t * const token, + const char * const to, + const int string_offset, + const bool is_start_of_line, int * match_offset) { UNUSED(match_offset); //return regex_match(pattern, to, string_offset, match_offset); - return regex_match(token->syntax, to + string_offset); + return regex_match(token->syntax, to, is_start_of_line, string_offset); } void render_string(const char * const string, @@ -264,7 +271,8 @@ void render_string(const char * const string, for (; token_index < token_table.element_count; token_index++) { token_t * t = *(token_t**)vector_get(&token_table, token_index); - f = token_fits(t, string, (int) (s - string), &offset); + const bool is_start_of_line = (s == string) || (*s == '\n'); + f = token_fits(t, string, (int)(s - string), is_start_of_line, &offset); if (f) { break; } diff --git a/source/regex.c b/source/regex.c index 9d01efa..3784311 100644 --- a/source/regex.c +++ b/source/regex.c @@ -24,7 +24,7 @@ 
bool is_magic(const char c) { if (is_quantifier(c)) { return true; } - for (const char * s = "\\[]."; *s != '\00'; s++) { + for (const char * s = "\\[].^"; *s != '\00'; s++) { if (*s == c) { return true; } @@ -47,11 +47,18 @@ typedef struct { typedef struct { int in; int to; + int width; } offshoot_t; typedef struct { bool * do_catch; bool * is_negative; +// these might be obsolete but I'm leaving them for now + bool * do_loop_hook; + bool * do_follow_hook; + bool * do_loop_shoot; + bool * do_follow_shoot; +// --- int * state; int * width; char * whitelist; @@ -88,6 +95,9 @@ static int escape_1_to_1(const char c, compiler_state * cs) { case '.': { strcat(target_list, "."); } return 1; + case '^': { + strcat(target_list, "^"); + } return 1; case '=': { strcat(target_list, "="); } return 1; @@ -365,7 +375,7 @@ void HOOK_ALL( int from, int to, compiler_state * cs) { - int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to); + int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to); for (const char * s = str; *s != '\0'; s++) { @@ -379,26 +389,39 @@ void HOOK_ALL( int from, } } -void OFFSHOOT(int from, - int to, - compiler_state * cs) { +void ABSOLUTE_OFFSHOOT(int from, + int to, + int width, + compiler_state * cs) { offshoot_t * offshoot = malloc(sizeof(offshoot_t)); - offshoot->in = *cs->state + from; - offshoot->to = *cs->state + to; + offshoot->in = from; + offshoot->to = to; + offshoot->width = width; vector_push(&cs->regex->catch_table, &offshoot); } +void OFFSHOOT(int from, + int to, + int width, + compiler_state * cs) { + ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs); +} + regex_t * regex_compile(const char * const pattern) { regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); regex->str = strdup(pattern); vector_init(&regex->delta_table, sizeof(delta_t*), 0UL); vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL); - int state = 0; + int state = 2; bool do_catch; bool is_negative; + bool 
do_loop_hook; + bool do_follow_hook; + bool do_loop_shoot; + bool do_follow_shoot; int width; char whitelist[64]; char blacklist[64]; @@ -416,14 +439,32 @@ regex_t * regex_compile(const char * const pattern) { for (const char * s = pattern; *s != '\00';) { // Reset the compiler assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); - whitelist[0] = '\00'; - blacklist[0] = '\00'; - do_catch = false; - is_negative = false; + whitelist[0] = '\0'; + blacklist[0] = '\0'; + do_catch = false; + is_negative = false; + do_loop_hook = false; + do_follow_hook = false; + do_loop_shoot = false; + do_follow_shoot = false; width = 1; // Translate char switch (*s) { + case '^': { + if (s == pattern) { + ABSOLUTE_OFFSHOOT(0, 2, 0, &cs); + ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs); + } + whitelist[0] = '\n'; + whitelist[1] = '\0'; + HOOK_ALL(0, whitelist, 0, &cs); + if (s != pattern) { + state += 1; + } + s += 1; + goto long_continue; + } break; case '.': { compile_dot(&cs); } break; @@ -435,8 +476,8 @@ regex_t * regex_compile(const char * const pattern) { s += compile_range(s, &cs) - 1; } break; default: { - whitelist[0] = *s; - whitelist[1] = '\00'; + whitelist[0] = *s; + whitelist[1] = '\0'; } break; } @@ -446,37 +487,38 @@ regex_t * regex_compile(const char * const pattern) { switch (*s) { case '=': case '?': { + do_loop_hook = true; HOOK_ALL(0, whitelist, +1, &cs); if (do_catch || is_negative) { - OFFSHOOT(0, +1, &cs); + OFFSHOOT(0, +1, 1, &cs); } s += 1; } break; case '*': { HOOK_ALL(0, whitelist, 0, &cs); if (do_catch) { - OFFSHOOT(0, +1, &cs); + OFFSHOOT(0, +1, 1, &cs); } else if (is_negative) { - OFFSHOOT(0, 0, &cs); + OFFSHOOT(0, 0, 1, &cs); } s += 1; } break; case '+': { HOOK_ALL(0, whitelist, +1, &cs); if (do_catch || is_negative) { - OFFSHOOT(0, +1, &cs); + OFFSHOOT(0, +1, 1, &cs); } state += 1; HOOK_ALL(0, whitelist, 0, &cs); if (do_catch || is_negative) { - OFFSHOOT(0, 0, &cs); + OFFSHOOT(0, 0, 1, &cs); } s += 1; } break; default: { // Literal 
HOOK_ALL(0, whitelist, +1, &cs); if (do_catch || is_negative) { - OFFSHOOT(0, +1, &cs); + OFFSHOOT(0, +1, 1, &cs); } state += 1; } break; @@ -489,6 +531,7 @@ regex_t * regex_compile(const char * const pattern) { filter_blacklist(whitelist, blacklist, filtered_blacklist); HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs); } + long_continue: } regex->accepting_state = state; @@ -509,37 +552,40 @@ int regex_free(regex_t * const regex) { // ----------------- // ### Searching ### // ----------------- -static bool catch_(const regex_t * const regex, +static int catch_(const regex_t * const regex, int * const state) { for (size_t i = 0; i < regex->catch_table.element_count; i++){ const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i); if (offshoot->in == *state) { *state = offshoot->to; - return true; + return offshoot->width; } } - return false; + return HALT_AND_CATCH_FIRE; } -static int regex_assert(const regex_t * const regex, - const char * const string, - int state, - int width) { - for (const char * s = string; *s != '\00'; s++) { +static int regex_assert(const regex_t * const regex, + const char * const string, + const int string_offset, + int state, + int width) { // XXX: I'm pretty sure this is actually redundant and the width should be calculated from string - s + for (const char * s = (string + string_offset); *s != '\00';) { // delta for (size_t i = 0; i < regex->delta_table.element_count; i++) { const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i); if ((delta->in == state) && (delta->input == *s)) { - int r = regex_assert(regex, s + delta->width, delta->to, width + 1); + int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1); if(r){ return r; } } } - if (catch_(regex, &state)) { - width += 1; + const int catch_width = catch_(regex, &state); + if ((catch_width != HALT_AND_CATCH_FIRE) + && (state != HALT_AND_CATCH_FIRE)) { + s += catch_width; continue; } @@ -549,8 +595,10 
@@ static int regex_assert(const regex_t * const regex, return false; } -int regex_match( regex_t * regex, - const char * const string) { +int regex_match( regex_t * regex, + const char * const string, + const bool is_start_of_string, + const int string_offset) { // XXX: this parameter is redundant and should be removed if (regex == NULL) { return false; } @@ -558,11 +606,13 @@ int regex_match( regex_t * regex, return true; } - return regex_assert(regex, string, 0, 0); + const int initial_state = (int)(!is_start_of_string); + + return regex_assert(regex, string, string_offset, initial_state, 0); } bool regex_search( regex_t * regex, const char * const string) { - return (bool)regex_match(regex, string); + return (bool)regex_match(regex, string, true, 0); } diff --git a/source/regex.h b/source/regex.h index 0049fcc..f35670d 100644 --- a/source/regex.h +++ b/source/regex.h @@ -16,7 +16,7 @@ typedef struct { extern regex_t * regex_compile(const char * const pattern); extern int regex_free(regex_t * const regex); extern bool regex_search(regex_t * regex, const char * const string); -extern int regex_match(regex_t * regex, const char * const string); +extern int regex_match(regex_t * regex, const char * const string, const bool start_of_string, const int string_offset); extern bool is_magic(const char c); diff --git a/tests/carrot.input b/tests/carrot.input new file mode 100644 index 0000000..f9dcfc9 --- /dev/null +++ b/tests/carrot.input @@ -0,0 +1,8 @@ +^ +^ ^ + ^ ^ ^^ + ^ ^^ ^3^ ^ +^ +^ + ^ +^