--- /dev/null
+#if __cplusplus
+# pragma GCC diagnostic ignored "-Wc++20-extensions"
+#endif
+
+#include "jeger.h"
+
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <stdlib.h>
+
+// First state number used for compiled pattern atoms. States 0 and 1 are
+// the entry states regex_match() starts from; regex_compile() connects them
+// to this state (or to the halt state) through catch-table offshoots.
+#define JEGER_INIT_STATE 2
+
+// ------------------
+// ### Char tests ###
+// ------------------
+// Membership test: true when `c` occurs in the NUL-terminated string `str`.
+// The terminator is never counted as a member, so '\0' always yields false.
+static inline
+bool mystrchr(const char * const str, const char c){
+    if (c == '\0') {
+        return false;
+    }
+    return strchr(str, c) != NULL;
+}
+
+// True for the characters that quantify the preceding atom: = ? + *
+static inline
+bool is_quantifier(const char c) {
+    switch (c) {
+        case '=':
+        case '?':
+        case '+':
+        case '*':
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True for the characters that form a hologram when escaped: < and >
+static inline
+bool is_hologram_escape(const char c) {
+    return (c == '<') || (c == '>');
+}
+
+// True when `c` has a special meaning in a pattern: any quantifier, or one
+// of \ [ ] . ^
+bool is_magic(const char c) {
+    switch (c) {
+        case '\\':
+        case '[':
+        case ']':
+        case '.':
+        case '^':
+            return true;
+        default:
+            return is_quantifier(c);
+    }
+}
+
+// -----------------
+// ### Char sets ###
+// -----------------
+// Building blocks for the character classes. Each macro is a string literal
+// so that classes can be assembled by literal concatenation.
+#define JEGER_CHAR_SET_at           "@"
+#define JEGER_CHAR_SET_underscore   "_"
+// Full 26-letter alphabets ('v' / 'V' were previously missing).
+#define JEGER_CHAR_SET_lower        "abcdefghijklmnopqrstuvwxyz"
+#define JEGER_CHAR_SET_upper        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define JEGER_CHAR_SET_digits       "0123456789"
+#define JEGER_CHAR_SET_octal_digits "01234567"
+#define JEGER_CHAR_SET_lower_hex    "abcdef"
+#define JEGER_CHAR_SET_upper_hex    "ABCDEF"
+// Byte values 0241 through 0277 (octal), 31 characters.
+#define JEGER_CHAR_SET_oct_241_to_277 \
+    "\241\242\243\244\245" \
+    "\246\247\250\251\252" \
+    "\253\254\255\256\257" \
+    "\260\261\262\263\264" \
+    "\265\266\267\270\271" \
+    "\272\273\274\275\276" \
+    "\277"
+// Byte values 0300 through 0337 (octal), 32 characters.
+#define JEGER_CHAR_SET_oct_300_to_337 \
+    "\300\301\302\303\304" \
+    "\305\306\307\310\311" \
+    "\312\313\314\315\316" \
+    "\317\320\321\322\323" \
+    "\324\325\326\327\330" \
+    "\331\332\333\334\335" \
+    "\336\337"
+#define JEGER_CHAR_SET_file_extra   "/.-_+,#$%~="
+#define JEGER_CHAR_SET_whitespace   " \t\v\n"
+
+// Underscore followed by the lower- and upper-case letter sets. Shared by
+// the `h` escape and by the blacklists of the `<` / `>` holograms.
+static const char JEGER_CHAR_very_word_chars[] =
+ JEGER_CHAR_SET_underscore
+ JEGER_CHAR_SET_lower
+ JEGER_CHAR_SET_upper
+ ;
+
+// ----------------------
+// ### Internal Types ###
+// ----------------------
+// One delta-table transition: taken when the matcher is in state `in` and
+// the current input character equals `input`.
+typedef struct {
+ int in; // state the transition leaves from
+ char input; // character that triggers the transition
+ int to; // state the transition leads to
+ int pattern_width; // how far the matcher advances in the input string
+ int match_width; // amount added to the reported match width
+} delta_t;
+
+// One catch-table entry: a fallback transition out of state `in`, consulted
+// by the matcher when no delta-table transition was found.
+typedef struct {
+ int in; // state the fallback leaves from
+ int to; // state the fallback leads to
+ int pattern_width; // how far the matcher advances in the input string
+ int match_width; // amount added to the reported match width
+} offshoot_t;
+
+// Bit flags stored in compiler_state.flags; one bit each so they can be
+// combined and masked freely.
+enum {
+    DO_CATCH              = 0x01, // set by compile_dot(): emit a catch offshoot
+    IS_NEGATIVE           = 0x02, // the current atom is described by the blacklist
+    IS_AT_THE_BEGINNING   = 0x04, // still compiling the first atom of the pattern
+    FORCE_START_OF_STRING = 0x08, // pattern began with '^'
+    INCREMENT_STATE       = 0x10, // advance the state counter after this atom
+};
+
+// Working state of regex_compile() while it processes one atom at a time.
+typedef struct {
+ int flags; // bitwise OR of the DO_CATCH ... INCREMENT_STATE flags
+ int state; // state number the current atom is compiled from
+ int width; // copied into pattern_width of every delta created by HOOK_ALL()
+ char * whitelist; // characters the current atom accepts (NUL-terminated)
+ char * blacklist; // characters the current atom rejects (NUL-terminated)
+} compiler_state;
+
+
+
+// ----------------------------------
+// ### Regex creation/destruction ###
+// ----------------------------------
+// Sentinel state number meaning "reject": regex_assert() returns false as
+// soon as it is asked to continue from this state.
+static const int HALT_AND_CATCH_FIRE = INT_MIN;
+
+// Turns a state offset relative to the compiler's current state into an
+// absolute state number, passing the HALT_AND_CATCH_FIRE sentinel through
+// unchanged. Requires a `cs` pointer to a compiler_state in scope where it
+// is expanded. The argument is parenthesized so compound expressions expand
+// correctly; it is evaluated twice, so it must not have side effects.
+#define ASSERT_HALT(a) (((a) == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + (a)))
+
+// Registers one delta-table transition for every character of `str`.
+//   from  - offset added to the compiler's current state to get the source state
+//   str   - NUL-terminated list of input characters that trigger the transition
+//   to    - destination offset relative to the current state, or
+//           HALT_AND_CATCH_FIRE to make the transition reject
+//   cs    - compiler state supplying the current state and the pattern width
+//   regex - regex whose delta_table receives the new entries
+// Each entry is heap-allocated and the table stores the pointer to it.
+static
+void HOOK_ALL(const int from,
+              const char * const str,
+              const int to,
+              const compiler_state * const cs,
+              regex_t * regex) {
+    for (const char * s = str; *s != '\0'; s++) {
+        delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
+        assert((delta != NULL) && "Out of memory.");
+        *delta = (delta_t){
+            .in            = cs->state + from,
+            .input         = *s,
+            .to            = ASSERT_HALT(to),
+            .pattern_width = cs->width,
+            .match_width   = 1,
+        };
+        vector_push(&regex->delta_table,
+                    &delta);
+    }
+}
+
+// Registers one catch-table entry between two absolute state numbers.
+//   from        - state the fallback leaves from
+//   to          - state the fallback leads to (may be HALT_AND_CATCH_FIRE)
+//   width       - stored as the entry's pattern_width
+//   match_width - stored as the entry's match_width
+//   regex       - regex whose catch_table receives the new entry
+// The entry is heap-allocated and the table stores the pointer to it.
+static
+void ABSOLUTE_OFFSHOOT(const int from,
+                       const int to,
+                       const int width,
+                       const int match_width,
+                       regex_t * regex) {
+    offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
+    assert((offshoot != NULL) && "Out of memory.");
+    *offshoot = (offshoot_t){
+        .in            = from,
+        .to            = to,
+        .pattern_width = width,
+        .match_width   = match_width,
+    };
+    vector_push(&regex->catch_table,
+                &offshoot);
+}
+
+// Same as ABSOLUTE_OFFSHOOT(), except that `from` and `to` are offsets
+// relative to the compiler's current state. A `to` equal to
+// HALT_AND_CATCH_FIRE is forwarded as the sentinel itself.
+static
+void OFFSHOOT(const int from,
+              const int to,
+              const int width,
+              const int match_width,
+              const compiler_state * cs,
+              regex_t * regex) {
+    const int source_state = cs->state + from;
+    const int target_state = ASSERT_HALT(to);
+    ABSOLUTE_OFFSHOOT(source_state, target_state, width, match_width, regex);
+}
+
+// Handles escapes that stand for exactly one character: the control
+// characters \t \n \r \b and the escaped magic characters [ ] . ^ = ? + * \.
+// The resulting character is appended to the blacklist when IS_NEGATIVE is
+// set, to the whitelist otherwise.
+// Returns 1 when `c` was such an escape, 0 when it was not (nothing appended).
+static
+int escape_1_to_1(const char c,
+                  const compiler_state * const cs) {
+    char literal[2] = { '\0', '\0' };
+
+    switch (c) {
+        case 't': literal[0] = '\t'; break;
+        case 'n': literal[0] = '\n'; break;
+        case 'r': literal[0] = '\r'; break;
+        case 'b': literal[0] = '\b'; break;
+        // escaped magic characters stand for themselves
+        case '[':
+        case ']':
+        case '.':
+        case '^':
+        case '=':
+        case '?':
+        case '+':
+        case '*':
+        case '\\':
+            literal[0] = c;
+            break;
+        default:
+            return 0;
+    }
+
+    char * const target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+    strcat(target_list, literal);
+    return 1;
+}
+
+// Handles escapes that stand for a whole character class (\i \I \k \K \f \F
+// \p \P \s \d \x \o \w \h \a \l \u). The class is APPENDED to the blacklist
+// when IS_NEGATIVE is set, to the whitelist otherwise, so characters already
+// collected for the current atom (e.g. earlier members of a [...] range) are
+// kept; this matches how escape_1_to_1() behaves.
+// Returns the number of characters in the class, or 0 when `c` is not a
+// class escape (nothing appended).
+// The largest class (\p / \P) has 64 characters, so the destination list
+// needs room for at least 65 bytes.
+// NOTE(review): several sets contain a literal '@' only; if '@' was meant
+// as a placeholder for "all alphabetic characters", the letters are missing
+// - confirm the intended contents.
+static
+int escape_1_to_N(const char c,
+                  const compiler_state * const cs) {
+    const char * set;
+
+    switch (c) {
+        case 'i': // identifier characters
+        case 'k': // keyword characters
+            set = JEGER_CHAR_SET_at
+                  JEGER_CHAR_SET_underscore
+                  JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_oct_300_to_337;
+            break;
+        case 'I': // identifier characters without digits
+        case 'K': // keyword characters without digits
+            set = JEGER_CHAR_SET_at
+                  JEGER_CHAR_SET_underscore
+                  JEGER_CHAR_SET_oct_300_to_337;
+            break;
+        case 'f': // file name characters
+            set = JEGER_CHAR_SET_at
+                  JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_file_extra;
+            break;
+        case 'F': // file name characters without digits
+            set = JEGER_CHAR_SET_at
+                  JEGER_CHAR_SET_file_extra;
+            break;
+        case 'p': // printable characters
+        case 'P':
+            set = JEGER_CHAR_SET_at
+                  JEGER_CHAR_SET_oct_241_to_277
+                  JEGER_CHAR_SET_oct_300_to_337;
+            break;
+        case 's':
+            set = JEGER_CHAR_SET_whitespace;
+            break;
+        case 'd':
+            set = JEGER_CHAR_SET_digits;
+            break;
+        case 'x':
+            set = JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_lower_hex
+                  JEGER_CHAR_SET_upper_hex;
+            break;
+        case 'o':
+            set = JEGER_CHAR_SET_octal_digits;
+            break;
+        case 'w':
+            set = JEGER_CHAR_SET_underscore
+                  JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_lower
+                  JEGER_CHAR_SET_upper;
+            break;
+        case 'h':
+            set = JEGER_CHAR_very_word_chars;
+            break;
+        case 'a':
+            set = JEGER_CHAR_SET_lower
+                  JEGER_CHAR_SET_upper;
+            break;
+        case 'l':
+            set = JEGER_CHAR_SET_lower;
+            break;
+        case 'u':
+            set = JEGER_CHAR_SET_upper;
+            break;
+        default:
+            return 0;
+    }
+
+    char * const target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+    strcat(target_list, set);
+    return (int)strlen(set);
+}
+
+// Handles the negated class escapes (\D \X \O \W \L \U): the characters of
+// the corresponding positive class are APPENDED to the blacklist and
+// IS_NEGATIVE is set. Appending (rather than overwriting) keeps blacklist
+// entries that were collected earlier for the same atom, consistent with
+// escape_1_to_1().
+// Returns the number of characters in the class, or 0 when `c` is not a
+// negated class escape (blacklist and flags untouched).
+static inline
+int escape_to_negative(const char c,
+                       compiler_state * const cs) {
+    const char * set;
+
+    switch (c) {
+        case 'D':
+            set = JEGER_CHAR_SET_digits;
+            break;
+        case 'X':
+            set = JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_lower_hex
+                  JEGER_CHAR_SET_upper_hex;
+            break;
+        case 'O':
+            set = JEGER_CHAR_SET_octal_digits;
+            break;
+        case 'W':
+            set = JEGER_CHAR_SET_underscore
+                  JEGER_CHAR_SET_digits
+                  JEGER_CHAR_SET_lower
+                  JEGER_CHAR_SET_upper;
+            break;
+        case 'L':
+            set = JEGER_CHAR_SET_lower;
+            break;
+        case 'U':
+            set = JEGER_CHAR_SET_upper;
+            break;
+        default:
+            return 0;
+    }
+
+    strcat(cs->blacklist, set);
+    cs->flags |= IS_NEGATIVE;
+    return (int)strlen(set);
+}
+
+// Compiles the '.' atom: nothing is added to either list; the DO_CATCH flag
+// alone tells the caller to emit a catch offshoot. Always returns 1.
+static inline
+int compile_dot(compiler_state * const cs) {
+    cs->flags = cs->flags | DO_CATCH;
+    return 1;
+}
+
+// Compiles the character following a backslash. The escape families are
+// tried in order (single character, character class, negated class) and the
+// first one that recognizes `c` wins.
+// Returns 1 when `c` was a known escape, 0 otherwise.
+static inline
+int compile_escape(const char c,
+                   compiler_state * const cs) {
+    if (escape_1_to_1(c, cs)) {
+        return 1;
+    }
+    if (escape_1_to_N(c, cs)) {
+        return 1;
+    }
+    return escape_to_negative(c, cs) != 0;
+}
+
+// Compiles a bracket expression starting at `range` (which must point at
+// the opening '['). A leading '^' sets IS_NEGATIVE; members are appended to
+// the blacklist in that case and to the whitelist otherwise. Members may be
+// single characters, backslash escapes, or `a-z` style spans.
+// Returns the number of pattern characters consumed, including both
+// brackets. Malformed input (unclosed range, unknown escape, reversed span)
+// trips an assert; when asserts are compiled out the function stops at the
+// string terminator instead of reading past it, and the returned count then
+// leaves the caller positioned on the terminator.
+static
+int compile_range(const char * const range,
+                  compiler_state * const cs) {
+    assert((range[0] == '[') && "Not a range.");
+
+    const char * s = range + 1;
+    if (*s == '^') {
+        cs->flags |= IS_NEGATIVE;
+        s += 1;
+    }
+
+    char * const target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+
+    for (; *s != ']'; s++) {
+        assert((*s != '\0') && "Unclosed range.");
+        if (*s == '\0') {
+            return (int)(s - range);
+        }
+
+        const char c = *s;
+        if (c == '\\') {
+            s += 1;
+            // Must run outside of assert(): the call has side effects and
+            // would disappear entirely when NDEBUG is defined.
+            const int is_known_escape = compile_escape(*s, cs);
+            assert(is_known_escape && "Unknown escape.");
+            (void)is_known_escape;
+            if (*s == '\0') {
+                return (int)(s - range);
+            }
+        } else if (*(s+1) == '-') {
+            const char end = *(s+2);
+            assert((c < end) && "Endless range.");
+            if (end == '\0') {
+                return (int)((s + 2) - range);
+            }
+            // Count in int: a char counter could never exceed `end` when
+            // `end` is CHAR_MAX, so the loop would not terminate.
+            for (int cc = c; cc <= end; cc++) {
+                const char member = (char)cc;
+                strncat(target_list, &member, 1);
+            }
+            s += 2;
+        } else {
+            strncat(target_list, &c, 1);
+        }
+    }
+
+    return (int)((s - range) + 1);
+}
+
+// Appends to `filtered` every character of `blacklist` that does not occur
+// in `whitelist`, preserving blacklist order.
+//   whitelist - NUL-terminated characters that must not be rejected
+//   blacklist - NUL-terminated candidate characters to reject
+//   filtered  - NUL-terminated output buffer; must have room for its current
+//               contents plus strlen(blacklist) characters and a terminator
+// The whole whitelist is searched for every blacklist character (the
+// previous version advanced a shared cursor and so only filtered reliably
+// for the first character).
+static
+void filter_blacklist(const char * whitelist,
+                      const char * blacklist,
+                      char * filtered) {
+    char * out = filtered + strlen(filtered);
+    for (; *blacklist != '\0'; blacklist++) {
+        if (strchr(whitelist, *blacklist) == NULL) {
+            *out++ = *blacklist;
+        }
+    }
+    *out = '\0';
+}
+
+// Compiles `pattern` into a newly allocated regex_t.
+// The pattern is processed one atom at a time. Each loop iteration first
+// translates the atom into a whitelist / blacklist / flag set ("Translate
+// char"), then looks at the character that follows it - a hologram or a
+// quantifier - to decide which delta and catch entries to emit ("Compile
+// char"). After the loop the two entry states 0 and 1 are connected to the
+// first pattern state.
+// Returns the compiled regex, to be released with regex_free(), or NULL when
+// the regex object itself cannot be allocated. Malformed patterns trip an
+// assert.
+// NOTE(review): the second switch keys on whatever character follows the
+// atom, so an unescaped '<', '>' or '^' directly after an atom is handled
+// as a hologram as well, and that atom's whitelist is never hooked -
+// confirm whether this is intended.
+regex_t * regex_compile(const char * const pattern) {
+    // Room for every distinct non-NUL character value once, plus the
+    // terminator. 64 bytes was not enough for the \p class (64 characters
+    // plus terminator).
+    enum { CHAR_LIST_CAPACITY = UCHAR_MAX + 2 };
+
+    regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
+    if (regex == NULL) {
+        return NULL;
+    }
+    regex->str = strdup(pattern);
+    vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
+    vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
+
+    char whitelist[CHAR_LIST_CAPACITY];
+    char blacklist[CHAR_LIST_CAPACITY];
+
+    compiler_state cs = {
+        .flags     = IS_AT_THE_BEGINNING,
+        .state     = JEGER_INIT_STATE,
+        .whitelist = whitelist,
+        .blacklist = blacklist,
+    };
+
+    for (const char * s = pattern; *s != '\00';) {
+        assert(!is_quantifier(*s) && "Pattern starts with quantifier.");
+        // Reset the compiler; only the two position flags survive an atom
+        whitelist[0] = '\0';
+        blacklist[0] = '\0';
+        cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
+        cs.width = 1;
+
+        // Translate char
+        switch (*s) {
+            case '^': {
+                ; // left in place; handled as a hologram below
+            } break;
+            case '.': {
+                compile_dot(&cs);
+                s += 1;
+            } break;
+            case '\\': {
+                s += 1;
+                if (compile_escape(*s, &cs)) {
+                    s += 1;
+                } else if (is_hologram_escape(*s)) {
+                    ; // left in place; handled as a hologram below
+                } else {
+                    // A bare string literal is always true and would never
+                    // fire; negate it so the assert actually trips.
+                    assert(!"Unknown escape.");
+                }
+            } break;
+            case '[': {
+                s += compile_range(s, &cs);
+            } break;
+            default: { // Literal
+                whitelist[0] = *s;
+                whitelist[1] = '\0';
+                s += 1;
+            } break;
+        }
+
+        // Compile char
+        switch (*s) {
+            // holograms
+            case '^': {
+                whitelist[0] = '\n';
+                whitelist[1] = '\0';
+                HOOK_ALL(0, whitelist, 0, &cs, regex);
+                if (cs.flags & IS_AT_THE_BEGINNING) {
+                    cs.flags |= FORCE_START_OF_STRING;
+                } else {
+                    cs.flags |= INCREMENT_STATE;
+                }
+                s += 1;
+            } break;
+            case '<': {
+                cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
+                if (cs.flags & IS_AT_THE_BEGINNING) {
+                    ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
+                }
+                strcat(blacklist, JEGER_CHAR_very_word_chars);
+                OFFSHOOT(0, 0, 1, 0, &cs, regex);
+                s += 1;
+            } break;
+            case '>': {
+                cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
+                strcat(blacklist, JEGER_CHAR_very_word_chars);
+                OFFSHOOT(0, 1, 0, 0, &cs, regex);
+                s += 1;
+            } break;
+            // quantifiers
+            case '=':
+            case '?': {
+                HOOK_ALL(0, whitelist, +1, &cs, regex);
+                if ((cs.flags & DO_CATCH)
+                ||  (cs.flags & IS_NEGATIVE)) {
+                    OFFSHOOT(0, +1, 1, 1, &cs, regex);
+                }
+                s += 1;
+            } break;
+            case '*': {
+                HOOK_ALL(0, whitelist, 0, &cs, regex);
+                if ((cs.flags & DO_CATCH)
+                ||  (cs.flags & IS_NEGATIVE)) {
+                    OFFSHOOT(0, 0, 1, 1, &cs, regex);
+                }
+                s += 1;
+            } break;
+            case '+': {
+                cs.flags |= INCREMENT_STATE;
+                HOOK_ALL(0, whitelist, +1, &cs, regex);
+                if ((cs.flags & DO_CATCH)
+                ||  (cs.flags & IS_NEGATIVE)) {
+                    OFFSHOOT(0, +1, 1, 1, &cs, regex);
+                }
+                HOOK_ALL(+1, whitelist, +1, &cs, regex);
+                if ((cs.flags & DO_CATCH)
+                ||  (cs.flags & IS_NEGATIVE)) {
+                    OFFSHOOT(+1, +1, 1, 1, &cs, regex);
+                }
+                s += 1;
+            } break;
+            default: { // Literal
+                cs.flags |= INCREMENT_STATE;
+                HOOK_ALL(0, whitelist, +1, &cs, regex);
+                if ((cs.flags & DO_CATCH)
+                ||  (cs.flags & IS_NEGATIVE)) {
+                    OFFSHOOT(0, +1, 1, 1, &cs, regex);
+                }
+            } break;
+        }
+
+        // Compile blacklist
+        if (*blacklist) {
+            char filtered_blacklist[CHAR_LIST_CAPACITY];
+            filtered_blacklist[0] = '\0';
+            filter_blacklist(whitelist, blacklist, filtered_blacklist);
+            HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs, regex);
+        }
+
+        if (cs.flags & INCREMENT_STATE) {
+            ++cs.state;
+        }
+
+        cs.flags &= (~IS_AT_THE_BEGINNING);
+    }
+
+    // Init state hookups
+    ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex);
+    if (cs.flags & FORCE_START_OF_STRING) {
+        ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex);
+    } else {
+        ABSOLUTE_OFFSHOOT(1, JEGER_INIT_STATE, 0, 0, regex);
+    }
+
+    regex->accepting_state = cs.state;
+
+    return regex;
+}
+
+// Releases a regex created by regex_compile(): every heap-allocated delta
+// and offshoot entry, both tables, the stored pattern copy and the regex
+// object itself. A NULL `regex` is accepted and ignored.
+// Always returns 0.
+int regex_free(regex_t * const regex) {
+    if (regex == NULL) {
+        return 0;
+    }
+
+    // The tables hold pointers to individually malloc'ed entries, which
+    // vector_free() on its own does not release.
+    for (size_t i = 0; i < regex->delta_table.element_count; i++) {
+        free(*(delta_t**)vector_get(&regex->delta_table, i));
+    }
+    for (size_t i = 0; i < regex->catch_table.element_count; i++) {
+        free(*(offshoot_t**)vector_get(&regex->catch_table, i));
+    }
+
+    free(regex->str);
+    vector_free(&regex->delta_table);
+    vector_free(&regex->catch_table);
+    free(regex);
+    return 0;
+}
+
+
+
+// -----------------
+// ### Searching ###
+// -----------------
+// Finds the first catch-table entry whose source state equals `*state`.
+// Returns a pointer to that entry, or NULL when the state has no fallback.
+static
+const offshoot_t * catch_table_lookup(const regex_t * const regex,
+                                      const int     * const state) {
+    for (size_t i = 0; i < regex->catch_table.element_count; i++){
+        const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
+        if (offshoot->in == *state) {
+            return offshoot;
+        }
+    }
+    return NULL;
+}
+
+// Runs the compiled automaton over `string`, starting in `state`.
+// For the current character every matching delta-table transition is tried
+// recursively; when none was found, the catch table supplies a fallback
+// transition and the walk continues iteratively. At the end of the string
+// only fallbacks with a pattern_width of 0 are still followed.
+//   regex  - compiled regex
+//   string - position in the subject text to match from
+//   state  - state to start in; HALT_AND_CATCH_FIRE fails immediately
+//   match  - updated in place: width grows by the match_width of followed
+//            transitions, position is filled in if it is still -1
+// Returns true when the walk ends in the regex's accepting state.
+// NOTE(review): the jump search assumes delta_table entries are ordered by
+// their `in` state - confirm that regex_compile() always emits them so.
+static
+bool regex_assert(const regex_t * const regex,
+                  const char    * const string,
+                  int                   state,
+                  match_t       * const match) {
+    if (state == HALT_AND_CATCH_FIRE) {
+        return false;
+    }
+
+    bool last_stand = false;
+    bool was_found;
+
+    const char * s = string;
+    LOOP: {
+        was_found = false;
+        if (*s == '\0') {
+            last_stand = true;
+            goto PERFORM_CATCH_LOOKUP;
+        }
+        // Jump search for the correct state
+        const int jump = 10;
+        size_t i = jump;
+        while (i < regex->delta_table.element_count) {
+            const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
+            if (delta->in >= state) {
+                break;
+            }
+            i += jump;
+        }
+        i -= jump;
+        // Linear search finish up
+        for (; i < regex->delta_table.element_count; i++) {
+            const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
+
+            if (delta->in > state) {
+                break;
+            }
+
+            if ((delta->in == state)
+            &&  (delta->input == *s)) {
+                was_found = true;
+                const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
+                if(r){
+                    if (match->position == -1) {
+                        match->position = (s - string);
+                    }
+                    match->width += delta->match_width;
+                    return r;
+                }
+            }
+        }
+    }
+
+    PERFORM_CATCH_LOOKUP: {
+        if (!was_found) {
+            const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
+            if (my_catch && (!my_catch->pattern_width || !last_stand)) {
+                state = my_catch->to;
+                s += my_catch->pattern_width;
+                match->width += my_catch->match_width;
+                goto LOOP;
+            }
+        }
+    }
+
+    return (state == regex->accepting_state);
+}
+
+// Collects every match of `regex` in `string`.
+//   regex              - compiled regex; NULL matches nothing (not to be
+//                        confused with an empty regex)
+//   string             - NUL-terminated subject text; NULL yields no matches
+//   is_start_of_string - whether the first character of `string` counts as
+//                        the start of the text
+// Returns a malloc'ed array of matches terminated by a sentinel entry whose
+// position and width are both -1; the caller releases it with free().
+// Returns NULL when the result array cannot be allocated.
+match_t * regex_match(const regex_t * const regex,
+                      const char    * const string,
+                      const bool            is_start_of_string) {
+
+    vector_t matches;
+    vector_init(&matches, sizeof(match_t), 0);
+
+    // The vector copies elements on push, so one stack object is enough;
+    // no per-match heap allocation is needed (or leaked).
+    match_t match;
+
+    // Find all matches
+    if ((regex != NULL) && (string != NULL)) {
+        const char * s = string;
+        do {
+            const int initial_state = (int)(!(is_start_of_string && (s == string)));
+
+            match = (match_t){
+                .position = -1,
+                .width    =  0,
+            };
+
+            int advance = 1;
+            if (regex_assert(regex, s, initial_state, &match)) {
+                match.position = (s - string);
+
+                vector_push(&matches, &match);
+
+                if (match.width > 0) {
+                    advance = match.width;
+                }
+            }
+
+            // Step forward, but never beyond the terminator (an empty
+            // string would otherwise be overrun on the first pass).
+            for (; (advance > 0) && (*s != '\0'); advance--) {
+                s += 1;
+            }
+        } while (*s != '\0');
+    }
+
+    // Insert sentinel
+    match = (match_t){
+        .position = -1,
+        .width    = -1,
+    };
+    vector_push(&matches, &match);
+
+    // Hide internal vector usage
+    const size_t data_size = matches.element_size * matches.element_count;
+    match_t * r = (match_t *)malloc(data_size);
+    if (r != NULL) {
+        memcpy(r, matches.data, data_size);
+    }
+    vector_free(&matches);
+
+    return r;
+}
+
+// Reports whether `regex` matches anywhere in `string`, treating the first
+// character of `string` as the start of the text.
+// Returns false when there is no match or when the match list could not be
+// obtained.
+bool regex_search(const regex_t * const regex,
+                  const char    * const string) {
+
+    match_t * m = regex_match(regex, string, true);
+    // The first entry is the sentinel (position -1) when nothing matched.
+    const bool r = (m != NULL) && (m->position != -1);
+    free(m);
+
+    return r;
+}
+++ /dev/null
-/* regex.c
- * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
- * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
-
-#include "regex.h"
-
-#include <assert.h>
-#include <string.h>
-#include <limits.h>
-#include <stdlib.h>
-
-// ------------------
-// ### Char tests ###
-// ------------------
-static bool is_quantifier(const char c) {
- for (const char * s = "+*?="; *s != '\00'; s++) {
- if (*s == c) {
- return true;
- }
- }
- return false;
-}
-
-bool is_magic(const char c) {
- if (is_quantifier(c)) {
- return true;
- }
- for (const char * s = "\\[].^"; *s != '\00'; s++) {
- if (*s == c) {
- return true;
- }
- }
- return false;
-}
-
-// ----------------------
-// ### Internal Types ###
-// ----------------------
-typedef struct {
- int in;
- char input;
- int to;
- int width;
-} delta_t;
-
-typedef struct {
- int in;
- int to;
- int width;
-} offshoot_t;
-
-typedef struct {
- // XXX:
- // These should share a mask
- // Not even sure why they are pointers to begin with
- bool * do_catch;
- bool * is_negative;
- bool is_at_the_beginning;
- bool do_skip;
-// these might be obsolite but im leaving them for now
- bool * do_loop_hook;
- bool * do_follow_hook;
- bool * do_loop_shoot;
- bool * do_follow_shoot;
-// ---
- int * state;
- int * width;
- char * whitelist;
- char * blacklist;
- regex_t * regex;
-} compiler_state;
-
-
-
-// ----------------------------------
-// ### Regex creation/destruction ###
-// ----------------------------------
-#define HALT_AND_CATCH_FIRE INT_MIN
-
-static void HOOK_ALL( int from,
- const char * const str,
- int to,
- compiler_state * cs) {
-
- int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
-
-
- for (const char * s = str; *s != '\0'; s++) {
- delta_t * delta = malloc(sizeof(delta_t));
- delta->in = *cs->state + from;
- delta->input = *s;
- delta->to = hook_to;
- delta->width = *cs->width;
- vector_push(&cs->regex->delta_table,
- &delta);
- }
-}
-
-static void ABSOLUTE_OFFSHOOT(int from,
- int to,
- int width,
- compiler_state * cs) {
- offshoot_t * offshoot = malloc(sizeof(offshoot_t));
- offshoot->in = from;
- offshoot->to = to;
- offshoot->width = width;
- vector_push(&cs->regex->catch_table,
- &offshoot);
-}
-
-static void OFFSHOOT(int from,
- int to,
- int width,
- compiler_state * cs) {
- ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
-}
-
-static int escape_1_to_1(const char c, compiler_state * cs) {
- char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
- switch (c) {
- case 't': {
- strcat(target_list, "\t");
- } return 1;
- case 'n': {
- strcat(target_list, "\n");
- } return 1;
- case 'r': {
- strcat(target_list, "\r");
- } return 1;
- case 'b': {
- strcat(target_list, "\b");
- } return 1;
- case '[': {
- strcat(target_list, "[");
- } return 1;
- case ']': {
- strcat(target_list, "]");
- } return 1;
- case '.': {
- strcat(target_list, ".");
- } return 1;
- case '^': {
- strcat(target_list, "^");
- } return 1;
- case '=': {
- strcat(target_list, "=");
- } return 1;
- case '?': {
- strcat(target_list, "?");
- } return 1;
- case '+': {
- strcat(target_list, "+");
- } return 1;
- case '*': {
- strcat(target_list, "*");
- } return 1;
- case '\\': {
- strcat(target_list, "\\");
- } return 1;
- }
-
- return 0;
-}
-
-static int escape_1_to_N(const char c, compiler_state * cs) {
- char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
- switch(c) {
- case 'i': {
- const char identifier_chars[] = "@0123456789_"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'I': {
- const char identifier_chars[] = "@_"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'k': {
- const char keyword_chars[] = "@0123456789_"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'K': {
- const char keyword_chars[] = "@_"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'f': {
- const char filename_chars[] = "@0123456789/.-_+,#$%~=";
- strcpy(target_list, filename_chars);
- return sizeof(filename_chars)-1;
- };
- case 'F': {
- const char filename_chars[] = "@/.-_+,#$%~=";
- strcpy(target_list, filename_chars);
- return sizeof(filename_chars)-1;
- };
- case 'p': {
- const char printable_chars[] = "@"
- "\241\242\243\244\245"
- "\246\247\250\251\252"
- "\253\254\255\256\257"
- "\260\261\262\263\264"
- "\265\266\267\270\271"
- "\272\273\274\275\276"
- "\277"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 'P': {
- const char printable_chars[] = "@"
- "\241\242\243\244\245"
- "\246\247\250\251\252"
- "\253\254\255\256\257"
- "\260\261\262\263\264"
- "\265\266\267\270\271"
- "\272\273\274\275\276"
- "\277"
- "\300\301\302\303\304"
- "\305\306\307\310\311"
- "\312\313\314\315\316"
- "\317\320\321\322\323"
- "\324\325\326\327\330"
- "\331\332\333\334\335"
- "\336\337";
- strcpy(target_list, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 's': {
- const char whitespace_chars[] = " \t\v\n";
- strcpy(target_list, whitespace_chars);
- return sizeof(whitespace_chars)-1;
- };
- case 'd': {
- const char digit_chars[] = "0123456789";
- strcpy(target_list, digit_chars);
- return sizeof(digit_chars)-1;
- };
- case 'x': {
- const char hex_chars[] = "0123456789"
- "abcdef"
- "ABCDEF";
- strcpy(target_list, hex_chars);
- return sizeof(hex_chars)-1;
- };
- case 'o': {
- const char oct_chars[] = "01234567";
- strcpy(target_list, oct_chars);
- return sizeof(oct_chars)-1;
- };
- case 'w': {
- const char word_chars[] = "0123456789"
- "abcdefghijklmnopqrstuwxyz"
- "ABCDEFGHIJKLMNOPQRSTUWXYZ"
- "_";
- strcpy(target_list, word_chars);
- return sizeof(word_chars)-1;
- };
- case 'h': {
- const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
- "ABCDEFGHIJKLMNOPQRSTUWXYZ"
- "_";
- strcpy(target_list, very_word_chars);
- return sizeof(very_word_chars)-1;
- };
- case 'a': {
- const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
- "ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(target_list, alpha_chars);
- return sizeof(alpha_chars)-1;
- };
- case 'l': {
- const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
- strcpy(target_list, lower_alpha_chars);
- return sizeof(lower_alpha_chars)-1;
- };
- case 'u': {
- const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(target_list, upper_alpha_chars);
- return sizeof(upper_alpha_chars)-1;
- };
- }
-
- return 0;
-}
-
-static int escape_to_negative(const char c,
- compiler_state * cs) {
- switch (c) {
- case 'D': {
- const char digit_chars[] = "0123456789";
- strcpy(cs->blacklist, digit_chars);
- *cs->is_negative = true;
- return sizeof(digit_chars)-1;
- };
- }
-
- return 0;
-}
-
-static int escape_hologram(const char c, compiler_state * cs) {
- switch (c) {
- case '<': {
- if (cs->is_at_the_beginning) {
- ABSOLUTE_OFFSHOOT(0, 2, 0, cs);
- cs->do_skip = true;
- }
- const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
- "ABCDEFGHIJKLMNOPQRSTUWXYZ"
- "_";
- *cs->is_negative = true; // effectless currently; should be used to trigger the following lines in the main compile loop
- strcat(cs->blacklist, very_word_chars);
- HOOK_ALL(0, cs->blacklist, HALT_AND_CATCH_FIRE, cs);
- OFFSHOOT(0, 0, 1, cs);
-
- return sizeof(very_word_chars)-1;
- };
- case '>': {
- const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
- "ABCDEFGHIJKLMNOPQRSTUWXYZ"
- "_";
- *cs->is_negative = true;
- strcat(cs->blacklist, very_word_chars);
-
- return 1;
- }
- }
- return 0;
-}
-
-static int compile_dot(compiler_state * cs) {
- *cs->do_catch = true;
- return true;
-}
-
-static int compile_escape(const char c,
- compiler_state * cs) {
-
- return escape_1_to_1(c, cs)
- || escape_1_to_N(c, cs)
- || escape_to_negative(c, cs)
- || escape_hologram(c, cs)
- ;
-}
-
-static int compile_range(const char * const range,
- compiler_state * cs) {
- assert((range[0] == '[') && "Not a range.");
-
- const char * s;
- if (range[1] == '^') {
- *cs->is_negative = true;
- s = range + 2;
- } else {
- s = range + 1;
- }
-
- char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
-
- for (; *s != ']'; s++) {
- assert((*s != '\0') && "Unclosed range.");
- char c = *s;
- if (c == '\\') {
- s += 1;
- assert(compile_escape(*s, cs) && "Unknown escape.");
- } else if (*(s+1) == '-') {
- char end = *(s+2);
- assert((c < end) && "Endless range.");
- for (char cc = c; cc < end+1; cc++) {
- strncat(target_list, &cc, 1);
- strncat(target_list, "\0", 1);
- }
- s += 2;
- } else {
- strncat(target_list, &c, 1);
- }
- }
-
- return ((s - range) + 1);
-}
-
-void filter_blacklist(const char * whitelist,
- const char * blacklist,
- char * filtered) {
- for (; *blacklist != '\0'; blacklist++) {
- for(; *whitelist != '\0'; whitelist++) {
- if (*blacklist == *whitelist) {
- goto long_continue;
- }
- }
- strncat(filtered, blacklist, 1);
- long_continue:
- ;
- }
-}
-
-regex_t * regex_compile(const char * const pattern) {
- regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
- regex->str = strdup(pattern);
- vector_init(®ex->delta_table, sizeof(delta_t*), 0UL);
- vector_init(®ex->catch_table, sizeof(offshoot_t*), 0UL);
-
- int state = 2;
-
- // this is plain retarded
- bool do_catch;
- bool is_negative;
- bool do_loop_hook;
- bool do_follow_hook;
- bool do_loop_shoot;
- bool do_follow_shoot;
- int width;
- char whitelist[64];
- char blacklist[64];
-
- compiler_state cs = {
- .do_catch = &do_catch,
- .is_negative = &is_negative,
- .is_at_the_beginning = true,
- .do_skip = false,
- .state = &state,
- .width = &width,
- .whitelist = whitelist,
- .blacklist = blacklist,
- .regex = regex,
- };
-
- for (const char * s = pattern; *s != '\00';) {
- // Reset the compiler
- assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
- whitelist[0] = '\0';
- blacklist[0] = '\0';
- do_catch = false;
- is_negative = false;
- cs.do_skip = false;
- /**/
- do_loop_hook = false;
- do_follow_hook = false;
- do_loop_shoot = false;
- do_follow_shoot = false;
- /**/
- width = 1;
-
- // Translate char
- switch (*s) {
- case '^': {
- if (cs.is_at_the_beginning) {
- ABSOLUTE_OFFSHOOT(0, 2, 0, &cs);
- ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
- }
- whitelist[0] = '\n';
- whitelist[1] = '\0';
- HOOK_ALL(0, whitelist, 0, &cs);
- if (s != pattern) {
- state += 1;
- }
- cs.do_skip = true;
- } break;
- case '.': {
- compile_dot(&cs);
- } break;
- case '\\': {
- s += 1;
- assert(compile_escape(*s, &cs) && "Unknown escape.");
- } break;
- case '[': {
- s += compile_range(s, &cs) - 1;
- } break;
- default: {
- whitelist[0] = *s;
- whitelist[1] = '\0';
- } break;
- }
-
- s += 1;
-
- if (cs.do_skip) {
- goto long_continue;
- }
-
- // Compile with quantifier
- switch (*s) {
- case '=':
- case '?': {
- do_loop_hook = true;
- HOOK_ALL(0, whitelist, +1, &cs);
- if (do_catch || is_negative) {
- OFFSHOOT(0, +1, 1, &cs);
- }
- s += 1;
- } break;
- case '*': {
- HOOK_ALL(0, whitelist, 0, &cs);
- if (do_catch) {
- OFFSHOOT(0, +1, 1, &cs);
- } else if (is_negative) {
- OFFSHOOT(0, 0, 1, &cs);
- }
- s += 1;
- } break;
- case '+': {
- HOOK_ALL(0, whitelist, +1, &cs);
- if (do_catch || is_negative) {
- OFFSHOOT(0, +1, 1, &cs);
- }
- state += 1;
- HOOK_ALL(0, whitelist, 0, &cs);
- if (do_catch || is_negative) {
- OFFSHOOT(0, 0, 1, &cs);
- }
- s += 1;
- } break;
- default: { // Literal
- HOOK_ALL(0, whitelist, +1, &cs);
- if (do_catch || is_negative) {
- OFFSHOOT(0, +1, 1, &cs);
- }
- state += 1;
- } break;
- }
-
- // Compile blacklist
- if (*blacklist) {
- char filtered_blacklist[64];
- filtered_blacklist[0] = '\0';
- filter_blacklist(whitelist, blacklist, filtered_blacklist);
- HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
- }
-
- long_continue:
- cs.is_at_the_beginning = false;
- long_continue:;
- }
-
- regex->accepting_state = state;
-
- return regex;
-}
-
-int regex_free(regex_t * const regex) {
- free(regex->str);
- vector_free(®ex->delta_table);
- vector_free(®ex->catch_table);
- free(regex);
- return 0;
-}
-
-
-
-// -----------------
-// ### Searching ###
-// -----------------
-static int catch_(const regex_t * const regex,
- int * const state) {
- for (size_t i = 0; i < regex->catch_table.element_count; i++){
- const offshoot_t * const offshoot = *(offshoot_t**)vector_get(®ex->catch_table, i);
- if (offshoot->in == *state) {
- *state = offshoot->to;
- return offshoot->width;
- }
- }
- return HALT_AND_CATCH_FIRE;
-}
-
-static int regex_assert(const regex_t * const regex,
- const char * const string,
- const int string_offset,
- int state,
- int width) { // XXX: im pretty sure this is actually redundant and the width should be calculated from string - s
- for (const char * s = (string + string_offset); *s != '\00';) {
- // XXX: this should be a jump search for the instate and then a linear
- // delta
- //int left = 0;
- //int right = regex->delta_table.element_count - 1;
- //int i;
- //while(left <= right) }
- for (size_t i = 0; i < regex->delta_table.element_count; i++) {
- //i = (left + right) / 2;
- const delta_t * const delta = *(delta_t**)vector_get(®ex->delta_table, i);
- if ((delta->in == state)
- && (delta->input == *s)) {
- int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
- if(r){
- return r;
- }
- }
- }
-
- const int catch_width = catch_(regex, &state);
- if ((catch_width != HALT_AND_CATCH_FIRE)
- && (state != HALT_AND_CATCH_FIRE)) {
- s += catch_width;
- continue;
- }
-
- // XXX: the extra catch might not be necessary if we were to compile to a simpler form
- catch_(regex, &state);
- return (state == regex->accepting_state) ? width : false;
- }
-
- return false;
-}
-
-int regex_match( regex_t * regex,
- const char * const string,
- const bool is_start_of_string,
- const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger
- if (regex == NULL) {
- return false;
- }
- if (string == NULL) {
- return true;
- }
-
- const int initial_state = (int)(!is_start_of_string);
-
- return regex_assert(regex, string, string_offset, initial_state, 0);
-}
-
-bool regex_search( regex_t * regex,
- const char * const string) {
-
- return (bool)regex_match(regex, string, true, 0);
-}