]> git.xolatile.top Git - public-libhl.git/commitdiff
adopted whitelist/blacklist logic
authoranon <anon@anon.anon>
Mon, 28 Aug 2023 13:42:42 +0000 (15:42 +0200)
committeranon <anon@anon.anon>
Mon, 28 Aug 2023 13:44:14 +0000 (15:44 +0200)
source/regex.c

index 05ab4461316195f4f953828f96b1459ebd6a04d4..da047a393fff2a1f966034d10f6d675c759aa17d 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <assert.h>
 #include <string.h>
+#include <limits.h>
 
 // ------------------
 // ### Char tests ###
@@ -54,6 +55,7 @@ typedef struct {
        int     * state;
        int     * width;
        char    * whitelist;
+       char    * blacklist;
        regex_t * regex;
 } compiler_state;
 
@@ -62,50 +64,52 @@ typedef struct {
 // ----------------------------------
 // ### Regex creation/destruction ###
 // ----------------------------------
-static int escape_1_to_1(const char c, char * whitelist) {
+static int escape_1_to_1(const char c, compiler_state * cs) { // Appends the single character denoted by escape letter c to the active list; returns 1 if c was handled, 0 otherwise.
+       char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; // blacklist when *cs->is_negative is set, whitelist otherwise
        switch (c) {
                case 't': {
-                       strcat(whitelist, "\t");
+                       strcat(target_list, "\t");
                } return 1;
                case 'n': {
-                       strcat(whitelist, "\n");
+                       strcat(target_list, "\n");
                } return 1;
                case 'r': {
-                       strcat(whitelist, "\r");
+                       strcat(target_list, "\r");
                } return 1;
                case 'b': {
-                       strcat(whitelist, "\b");
+                       strcat(target_list, "\b"); // "\b" is the C backspace escape, so a backspace character is appended
                } return 1;
                case '[': {
-                       strcat(whitelist, "[");
+                       strcat(target_list, "[");
                } return 1;
                case ']': {
-                       strcat(whitelist, "]");
+                       strcat(target_list, "]");
                } return 1;
                case '.': {
-                       strcat(whitelist, ".");
+                       strcat(target_list, ".");
                } return 1;
                case '=': {
-                       strcat(whitelist, "=");
+                       strcat(target_list, "=");
                } return 1;
                case '?': {
-                       strcat(whitelist, "?");
+                       strcat(target_list, "?");
                } return 1;
                case '+': {
-                       strcat(whitelist, "+");
+                       strcat(target_list, "+");
                } return 1;
                case '*': {
-                       strcat(whitelist, "*");
+                       strcat(target_list, "*");
                } return 1;
                case '\\': {
-                       strcat(whitelist, "\\");
+                       strcat(target_list, "\\"); // appends one literal backslash
                } return 1;
        }
 
        return 0;
 }
 
-static int escape_1_to_N(const char c, char * whitelist) {
+static int escape_1_to_N(const char c, compiler_state * cs) {
+       char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
        switch(c) {
                case 'i': {
                        const char identifier_chars[] = "@0123456789_"
@@ -116,7 +120,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                        "\324\325\326\327\330"
                                                        "\331\332\333\334\335"
                                                        "\336\337";
-                       strcpy(whitelist, identifier_chars);
+                       strcpy(target_list, identifier_chars);
                        return sizeof(identifier_chars)-1;
                };
                case 'I': {
@@ -128,7 +132,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                        "\324\325\326\327\330"
                                                        "\331\332\333\334\335"
                                                        "\336\337";
-                       strcpy(whitelist, identifier_chars);
+                       strcpy(target_list, identifier_chars);
                        return sizeof(identifier_chars)-1;
                };
                case 'k': {
@@ -140,7 +144,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                     "\324\325\326\327\330"
                                                     "\331\332\333\334\335"
                                                     "\336\337";
-                       strcpy(whitelist, keyword_chars);
+                       strcpy(target_list, keyword_chars);
                        return sizeof(keyword_chars)-1;
                };
                case 'K': {
@@ -152,17 +156,17 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                     "\324\325\326\327\330"
                                                     "\331\332\333\334\335"
                                                     "\336\337";
-                       strcpy(whitelist, keyword_chars);
+                       strcpy(target_list, keyword_chars);
                        return sizeof(keyword_chars)-1;
                };
                case 'f': {
                        const char filename_chars[] = "@0123456789/.-_+,#$%~=";
-                       strcpy(whitelist, filename_chars);
+                       strcpy(target_list, filename_chars);
                        return sizeof(filename_chars)-1;
                };
                case 'F': {
                        const char filename_chars[] = "@/.-_+,#$%~=";
-                       strcpy(whitelist, filename_chars);
+                       strcpy(target_list, filename_chars);
                        return sizeof(filename_chars)-1;
                };
                case 'p': {
@@ -181,7 +185,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                       "\324\325\326\327\330"
                                                       "\331\332\333\334\335"
                                                       "\336\337";
-                       strcpy(whitelist, printable_chars);
+                       strcpy(target_list, printable_chars);
                        return sizeof(printable_chars)-1;
                };
                case 'P': {
@@ -200,29 +204,29 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                       "\324\325\326\327\330"
                                                       "\331\332\333\334\335"
                                                       "\336\337";
-                       strcpy(whitelist, printable_chars);
+                       strcpy(target_list, printable_chars);
                        return sizeof(printable_chars)-1;
                };
                case 's': {
                        const char whitespace_chars[] = " \t\v\n";
-                       strcpy(whitelist, whitespace_chars);
+                       strcpy(target_list, whitespace_chars);
                        return sizeof(whitespace_chars)-1;
                };
                case 'd': {
                        const char digit_chars[] = "0123456789";
-                       strcpy(whitelist, digit_chars);
+                       strcpy(target_list, digit_chars);
                        return sizeof(digit_chars)-1;
                };
                case 'x': {
                        const char hex_chars[] = "0123456789"
                                                 "abcdef"
                                                 "ABCDEF";
-                       strcpy(whitelist, hex_chars);
+                       strcpy(target_list, hex_chars);
                        return sizeof(hex_chars)-1;
                };
                case 'o': {
                        const char oct_chars[] = "01234567";
-                       strcpy(whitelist, oct_chars);
+                       strcpy(target_list, oct_chars);
                        return sizeof(oct_chars)-1;
                };
                case 'w': {
@@ -230,30 +234,30 @@ static int escape_1_to_N(const char c, char * whitelist) {
                                                  "abcdefghijklmnopqrstuwxyz"
                                                  "ABCDEFGHIJKLMNOPQRSTUWXYZ"
                                                  "_";
-                       strcpy(whitelist, word_chars);
+                       strcpy(target_list, word_chars);
                        return sizeof(word_chars)-1;
                };
                case 'h': {
                        const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
                                                       "ABCDEFGHIJKLMNOPQRSTUWXYZ"
                                                       "_";
-                       strcpy(whitelist, very_word_chars);
+                       strcpy(target_list, very_word_chars);
                        return sizeof(very_word_chars)-1;
                };
                case 'a': {
                        const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
                                                   "ABCDEFGHIJKLMNOPQRSTUWXYZ";
-                       strcpy(whitelist, alpha_chars);
+                       strcpy(target_list, alpha_chars);
                        return sizeof(alpha_chars)-1;
                };
                case 'l': {
                        const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
-                       strcpy(whitelist, lower_alpha_chars);
+                       strcpy(target_list, lower_alpha_chars);
                        return sizeof(lower_alpha_chars)-1;
                };
                case 'u': {
                        const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
-                       strcpy(whitelist, upper_alpha_chars);
+                       strcpy(target_list, upper_alpha_chars);
                        return sizeof(upper_alpha_chars)-1;
                };
        }
@@ -291,34 +295,47 @@ static int compile_range(const char * const     range,
        for (; *s != ']'; s++) {
                assert((*s != '\0') && "Unclosed range.");
                char c = *s;
-               if (escape_1_to_1(c, whitelist)
-               ||  escape_1_to_N(c, whitelist)) {
-                       ;
+               if (c == '\\') {
+                       s += 1;
+                       assert(compile_escape(*s, cs) && "Unknown escape.");
                } else if (*(s+1) == '-') {
                        char end = *(s+2);
                        assert((c < end) && "Endless range.");
                        for (char cc = c; cc < end+1; cc++) {
-                               strncat(whitelist,   &cc, 1);
-                               strncat(whitelist, "\0", 1);
+                               strncat(target_list,  &cc, 1);
+                               strncat(target_list, "\0", 1);
                        }
                        s += 2;
                } else {
-                       strncat(whitelist,    &c, 1);
-                       strncat(whitelist, "\00", 1);
+                       strncat(target_list,   &c, 1);
                }
        }
 
        return ((s - range) + 1);
 }
 
-#define HALT_AND_CATCH_FIRE -1
+// Appends to `filtered` each character of `blacklist` that does not occur in
+// `whitelist`. `filtered` must already hold a NUL-terminated string with room
+// for the appended characters. Membership is tested against `whitelist`: a
+// character always matches itself in `blacklist`, which would keep nothing.
+void filter_blacklist(const char * const whitelist,
+                      const char * const blacklist,
+                            char * const  filtered) {
+       for (const char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
+               if (strchr(whitelist, *black_pointer) == NULL) {
+                       strncat(filtered, black_pointer, 1);
+               }
+       }
+}
 
-void HOOK_ALL(int              from,
-               const char * const      str,
-                     int                to,
-                     compiler_state *   cs) {
+#define HALT_AND_CATCH_FIRE INT_MIN // sentinel `to` value: HOOK_ALL maps it to hook target -1 instead of a state-relative target
 
-       int hook_to = (*cs->is_negative) ? HALT_AND_CATCH_FIRE : *cs->state + to;
+void HOOK_ALL(      int              from,
+              const char * const      str,
+                    int                to,
+                    compiler_state *   cs) {
+
+       int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to);
 
 
        for (const char * s = str; *s != '\0'; s++) {
@@ -330,18 +347,17 @@ void HOOK_ALL(int              from,
                vector_push(&cs->regex->delta_table,
                            &delta);
        }
-       if (*cs->do_catch || *cs->is_negative) {
-               offshoot_t * offshoot = malloc(sizeof(offshoot_t));
-               offshoot->in = *cs->state + from; 
-               offshoot->to   = hook_to;
-               vector_push(&cs->regex->catch_table,
-                           &offshoot);
-       }
 }
 
-#define EAT(n) do { \
-       s += n;         \
-} while (0)
+// Allocates an offshoot from (state + from) to (state + to) and passes it to vector_push on the regex's catch_table.
+void OFFSHOOT(int from, int to, compiler_state * cs) {
+       const int base = *cs->state;   // both endpoints are offsets from the current state
+       offshoot_t * entry = malloc(sizeof(offshoot_t));   // NOTE(review): result is not checked for NULL
+       entry->in = base + from;
+       entry->to = base + to;
+       // vector_push is given the address of the pointer variable, not of the pointee.
+       vector_push(&cs->regex->catch_table, &entry);
+}
 
 regex_t * regex_compile(const char * const pattern) {
        regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
@@ -355,6 +371,7 @@ regex_t * regex_compile(const char * const pattern) {
        bool is_negative;
        int width;
        char whitelist[64];
+       char blacklist[64];
 
        compiler_state cs = {
                .do_catch    = &do_catch,
@@ -362,35 +379,30 @@ regex_t * regex_compile(const char * const pattern) {
                .state       = &state,
                .width       = &width,
                .whitelist   = whitelist,
+               .blacklist   = blacklist,
                .regex       = regex,
        };
 
        for (const char * s = pattern; *s != '\00';) {
-               // Get token
+               // Reset the compiler
                assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
                whitelist[0] = '\00';
+               blacklist[0] = '\00';
                do_catch     = false;
                is_negative  = false;
                width        = 1;
 
+               // Translate char
                switch (*s) {
                        case '.': {
-                               do_catch = true;
+                               compile_dot(&cs);
                        } break;
                        case '\\': {
-                               //if (compile_hologram(*s, whitelist)) {
-                               //      break;
-                               //}
-                               EAT(1);
-                               if(escape_1_to_1(*s, whitelist)
-                               || escape_1_to_N(*s, whitelist)){
-                                       ;
-                               } else {
-                                       assert(!"Unknown escape.");
-                               }
+                               s += 1;
+                               assert(compile_escape(*s, &cs) && "Unknown escape.");
                        } break;
                        case '[': {
-                               EAT(compile_range(s, whitelist, &is_negative)-1);
+                               s += compile_range(s, &cs) - 1;
                        } break;
                        default: {
                                whitelist[0] = *s;
@@ -398,30 +410,55 @@ regex_t * regex_compile(const char * const pattern) {
                        } break;
                }
 
-               EAT(1);
+               s += 1;
 
-               // Get quantifier
+               // Compile with quantifier
                switch (*s) {
                        case '=':
                        case '?': {
                                HOOK_ALL(0, whitelist, +1, &cs);
-                               EAT(1);
+                               if (do_catch || is_negative) {
+                                       OFFSHOOT(0, +1, &cs);
+                               }
+                               s += 1;
                        } break;
                        case '*': {
                                HOOK_ALL(0, whitelist,  0, &cs);
-                               EAT(1);
+                               if (do_catch) {
+                                       OFFSHOOT(0, +1, &cs);
+                               } else if (is_negative) {
+                                       OFFSHOOT(0,  0, &cs);
+                               }
+                               s += 1;
                        } break;
                        case '+': {
                                HOOK_ALL(0, whitelist, +1, &cs);
+                               if (do_catch || is_negative) {
+                                       OFFSHOOT(0, +1, &cs);
+                               }
                                state += 1;
                                HOOK_ALL(0, whitelist,  0, &cs);
-                               EAT(1);
+                               if (do_catch || is_negative) {
+                                       OFFSHOOT(0, 0, &cs);
+                               }
+                               s += 1;
                        } break;
                        default: { // Literal
                                HOOK_ALL(0, whitelist, +1, &cs);
+                               if (do_catch || is_negative) {
+                                       OFFSHOOT(0, +1, &cs);
+                               }
                                state += 1;
                        } break;
                }
+
+               // Compile blacklist
+               if (*blacklist) {
+                       char filtered_blacklist[64];
+                       filtered_blacklist[0] = '\0'; 
+                       filter_blacklist(whitelist, blacklist, filtered_blacklist);
+                       HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
+               }
        }
 
        regex->accepting_state = state;