git.xolatile.top Git - public-libhl.git/commitdiff
i deserve a blowjob
author anon <anon@anon.anon>
Sat, 23 Sep 2023 15:06:44 +0000 (17:06 +0200)
committer anon <anon@anon.anon>
Sat, 23 Sep 2023 15:06:44 +0000 (17:06 +0200)
include/jeger.h
source/hl.c
source/jeger.c

index 5c6d622c9ccd67e16b0eda33b2d70b97dbc8d3a5..97efd03f6d59fc0ba3825f97a4faf5d2163b311b 100644 (file)
@@ -15,7 +15,10 @@ typedef struct {
 } regex_t;
 
 typedef struct {
-       int position;
+       union {
+               int          position;
+               const char * _pos_ptr;
+       };
        int width;
 } match_t;
 
@@ -25,5 +28,6 @@ extern bool      regex_search(const regex_t * const regex, const char * const st
 extern match_t * regex_match(const regex_t * const regex, const char * const string, const bool start_of_string);
 
 extern bool is_magic(const char c);
+extern bool is_sentinel(const match_t * const match);
 
 #endif
index 96b8343f648fb8e6c9499cb49ac1ec4469831ba1..6932d2d80e86d755501e49533b41b913bd971cd1 100644 (file)
@@ -195,7 +195,7 @@ void render_string(const char * const string,
                token_t * t = *(token_t**)vector_get(&token_table,
                                                     i);
                match_t * match = regex_match(t->syntax, string, true);
-               if (match->position == -1) {
+               if (is_sentinel(match)) {
                        free(match);
                        continue;
                }
@@ -212,7 +212,7 @@ void render_string(const char * const string,
                max = &sentinel;
                for (int h = 0; h < rrs; h++) {
                        result_t * const current_result = r + h;
-                       for (int j = 0; current_result->m[j].position != -1; j++) {
+                       for (int j = 0; !is_sentinel(&(current_result->m[j])); j++) {
                                if (current_result->m[j].position == (s - string)) {
                                        if (current_result->m[j].width > max->m->width) {
                                                current_result->i = j;
index 5074182e46ac8421c36b0d752558e428964f61ae..343f6390ec7fe36ebe389e92b079ba06a8355be9 100644 (file)
@@ -8,8 +8,13 @@
 #include <string.h>
 #include <limits.h>
 #include <stdlib.h>
+#if DEBUG
+# include <stdio.h>
+#endif
 
-#define JEGER_INIT_STATE    2
+#define JEGER_SOS_STATE   0
+#define JEGER_NSOS_STATE  1
+#define JEGER_INIT_STATE  2
 
 // ------------------
 // ### Char tests ###
@@ -40,6 +45,15 @@ bool is_magic(const char c) {
                ;
 }
 
+// -------------------
+// ### Match tests ###
+// -------------------
+bool is_sentinel(const match_t * const match) { // true only when BOTH fields of *match equal -1
+       return (match->position == -1)  // position shares storage with _pos_ptr (anonymous union in match_t)
+           && (match->width    == -1)  // NOTE(review): regex_match seeds matches with width 0 - confirm the terminator entry is stored as {-1, -1}
+           ;
+}
+
 // -----------------
 // ### Char sets ###
 // -----------------
@@ -52,13 +66,13 @@ bool is_magic(const char c) {
 #define JEGER_CHAR_SET_lower_hex         "abcdef"
 #define JEGER_CHAR_SET_upper_hex         "ABCDEF"
 #define JEGER_CHAR_SET_oct_241_to_277                           \
-                                                    "\241\242\243\244\245" \
-                                                    "\246\247\250\251\252" \
-                                                    "\253\254\255\256\257" \
-                                                    "\260\261\262\263\264" \
-                                                    "\265\266\267\270\271" \
-                                                    "\272\273\274\275\276" \
-                                                    "\277"
+                                         "\241\242\243\244\245" \
+                                         "\246\247\250\251\252" \
+                                         "\253\254\255\256\257" \
+                                         "\260\261\262\263\264" \
+                                         "\265\266\267\270\271" \
+                                         "\272\273\274\275\276" \
+                                         "\277"
 #define JEGER_CHAR_SET_oct_300_to_337                           \
                                          "\300\301\302\303\304" \
                                          "\305\306\307\310\311" \
@@ -68,13 +82,13 @@ bool is_magic(const char c) {
                                          "\331\332\333\334\335" \
                                          "\336\337"
 #define JEGER_CHAR_SET_file_extra        "/.-_+,#$%~="
-#define JEGER_CHAR_SET_whitespace        " \t\v\n"
+#define JEGER_CHAR_SET_whitespace        " " "\t\v\n"
 
-static const char JEGER_CHAR_very_word_chars[] = 
-                                   JEGER_CHAR_SET_underscore
-                                   JEGER_CHAR_SET_lower
-                                   JEGER_CHAR_SET_upper
-                                 ;
+static const char JEGER_CHAR_symbol_chars[] = 
+                                             JEGER_CHAR_SET_underscore
+                                             JEGER_CHAR_SET_lower
+                                             JEGER_CHAR_SET_upper
+                                         ;
 
 // ----------------------
 // ### Internal Types ###
@@ -95,17 +109,19 @@ typedef struct {
 } offshoot_t;
 
 enum {
-       DO_CATCH              = 0x00000001 << 0,
-       IS_NEGATIVE           = 0x00000001 << 1,
-       IS_AT_THE_BEGINNING   = 0x00000001 << 2,
-       FORCE_START_OF_STRING = 0x00000001 << 3,
-       INCREMENT_STATE       = 0x00000001 << 4,
+       DO_CATCH                  = 0x00000001 << 0,
+       IS_NEGATIVE               = 0x00000001 << 1,
+       IS_AT_THE_BEGINNING       = 0x00000001 << 2,
+       FORCE_START_OF_STRING     = 0x00000001 << 3,
+       DO_FORBID_START_OF_STRING = 0x00000001 << 4,
+       INCREMENT_STATE           = 0x00000001 << 5,
 };
 
 typedef struct {
        int       flags;
        int       state;
        int       width;
+       int       width2;
        char    * whitelist;
        char    * blacklist;
 } compiler_state;
@@ -132,7 +148,7 @@ void HOOK_ALL(const int                         from,
                        .input         = *s,
                        .to            = ASSERT_HALT(to),
                        .pattern_width = cs->width,
-                       .match_width   = 1,
+                       .match_width   = cs->width2,
                };
                vector_push(&regex->delta_table,
                            &delta);
@@ -318,9 +334,9 @@ int escape_1_to_N(const char                    c,
                        return sizeof(word_chars)-1;
                };
                case 'h': {
-                       // #global JEGER_CHAR_very_word_chars
-                       strcpy(target_list, JEGER_CHAR_very_word_chars);
-                       return sizeof(JEGER_CHAR_very_word_chars)-1;
+                       // #global JEGER_CHAR_symbol_chars
+                       strcpy(target_list, JEGER_CHAR_symbol_chars);
+                       return sizeof(JEGER_CHAR_symbol_chars)-1;
                };
                case 'a': {
                        const char alpha_chars[] = JEGER_CHAR_SET_lower
@@ -346,7 +362,7 @@ int escape_1_to_N(const char                    c,
 
 static inline
 int escape_to_negative(const char                    c,
-                                compiler_state * const cs) {
+                             compiler_state * const cs) {
        switch (c) {
                case 'D': {
                        const char digit_chars[] = JEGER_CHAR_SET_digits;
@@ -488,6 +504,7 @@ regex_t * regex_compile(const char * const pattern) {
                blacklist[0] = '\0';
                cs.flags    &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
                cs.width     = 1;
+               cs.width2    = 1;
 
                // Translate char
                switch (*s) {
@@ -503,7 +520,7 @@ regex_t * regex_compile(const char * const pattern) {
                                if (compile_escape(*s, &cs)) {
                                        s += 1;
                                } else if (is_hologram_escape(*s)) {
-                                       ;
+                                       s -= 1;
                                } else {
                                        assert("Unknown escape.");
                                }
@@ -518,6 +535,12 @@ regex_t * regex_compile(const char * const pattern) {
                        } break;
                }
 
+               /* Ew */
+               if (*s == '\\'
+               &&  is_hologram_escape(*(s+1))) {
+                       ++s;
+               }
+
                // Compile char
                switch (*s) {
                        // holograms
@@ -533,18 +556,47 @@ regex_t * regex_compile(const char * const pattern) {
                                s += 1;
                        } break;
                        case '<': {
-                               cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
-                               if (cs.flags & IS_AT_THE_BEGINNING) {
-                                       ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
+                               // XXX: make this legible
+                               if (cs.flags & IS_AT_THE_BEGINNING
+                               && !(cs.flags & DO_CATCH)
+                               && !(cs.flags & IS_NEGATIVE)
+                               && whitelist[0] == '\0') {
+                                       // ---
+                                       cs.flags |= INCREMENT_STATE;
+                                       cs.flags |= DO_FORBID_START_OF_STRING;
+                                       strcat(whitelist, JEGER_CHAR_symbol_chars);
+                                       // ---
+                                       ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex);
+                                       ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex);
+                                       HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex);
+                                       // ---
+                                       ++cs.state;
+                                       cs.width = 0;
+                                       cs.width2 = 0;
+                                       HOOK_ALL(0, whitelist, +1, &cs, regex);
+                                       cs.width = 1;
+                                       OFFSHOOT(0, +1, 1, 0, &cs, regex);
+                                       // ---
+                               } else {
+                                       HOOK_ALL(0, whitelist, +1, &cs, regex);
+                                       if ((cs.flags & DO_CATCH)
+                                       ||  (cs.flags & IS_NEGATIVE)) {
+                                               OFFSHOOT(+1, +2, 1, 1, &cs, regex);
+                                       } else {
+                                               cs.flags |= INCREMENT_STATE;
+                                       }
+                                       OFFSHOOT(0, +1, 1, 0, &cs, regex);
                                }
-                               strcat(blacklist, JEGER_CHAR_very_word_chars);
-                               OFFSHOOT(0, 0, 1, 0, &cs, regex);
+                               cs.flags |= IS_NEGATIVE;
+                               strcat(blacklist, JEGER_CHAR_symbol_chars);
                                s += 1;
                        } break;
                        case '>': {
+                               HOOK_ALL(0, whitelist, +1, &cs, regex);
                                cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
-                               strcat(blacklist, JEGER_CHAR_very_word_chars);
-                               OFFSHOOT(0, 1, 0, 0, &cs, regex); 
+                               strcat(blacklist, JEGER_CHAR_symbol_chars);
+                               OFFSHOOT(+1, +2, 0, 0, &cs, regex); 
+                               ++cs.state;
                                s += 1;
                        } break;
                        // quantifiers
@@ -605,11 +657,13 @@ regex_t * regex_compile(const char * const pattern) {
        }
 
        // Init state hookups
-       ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex);
+       if (!(cs.flags & DO_FORBID_START_OF_STRING)) {
+               ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
+       }
        if (cs.flags & FORCE_START_OF_STRING) {
-               ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex);
+               ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex);
        } else {
-               ABSOLUTE_OFFSHOOT(1,    JEGER_INIT_STATE, 0, 0, regex);
+               ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE,    JEGER_INIT_STATE, 0, 0, regex);
        }
 
        regex->accepting_state = cs.state;
@@ -682,14 +736,18 @@ bool regex_assert(const regex_t * const         regex,
 
                        if ((delta->in == state) 
                        &&  (delta->input == *s)) {
+                               bool do_reset = false;
                                was_found = true;
+                               if (!match->_pos_ptr && delta->match_width) {
+                                       match->_pos_ptr = s;
+                                       do_reset = true;
+                               }
                                const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
                                if(r){
-                                       if (match->position == -1) {
-                                               match->position = (s - string);
-                                       }
                                        match->width += delta->match_width;
                                        return r;
+                               } else if (do_reset) {
+                                       match->_pos_ptr = NULL;
                                }
                        }
                }
@@ -729,17 +787,21 @@ match_t * regex_match(const regex_t * const              regex,
        // Find all matches
        {
                const char * s = string;
+               int initial_state;
                do {
-                       int initial_state;
                        initial_state = (int)(!(is_start_of_string && (s == string)));
 
                        *match = (match_t){
-                               .position = -1,
-                               .width    =  0,
+                               ._pos_ptr = NULL,
+                               .width    =    0,
                        };
 
                        if (regex_assert(regex, s, initial_state, match)) {
-                               match->position = (s - string);
+                               if (match->_pos_ptr) {
+                                       match->position = (match->_pos_ptr - string);
+                               } else {
+                                       match->position = (s - string);
+                               }
 
                                vector_push(&matches, match);
 
@@ -773,7 +835,7 @@ bool regex_search(const regex_t * const  regex,
                   const char    * const string) {
 
        match_t * m = regex_match(regex, string, true);
-       const bool r = (m->position != -1);
+       const bool r = !is_sentinel(m);
        free(m);
 
        return r;