#include <assert.h>
#include <string.h>
+#include <limits.h>
// ------------------
// ### Char tests ###
int * state;
int * width;
char * whitelist;
+ char * blacklist;
regex_t * regex;
} compiler_state;
// ----------------------------------
// ### Regex creation/destruction ###
// ----------------------------------
-static int escape_1_to_1(const char c, char * whitelist) {
+static int escape_1_to_1(const char c, compiler_state * cs) {
+ char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
switch (c) {
case 't': {
- strcat(whitelist, "\t");
+ strcat(target_list, "\t");
} return 1;
case 'n': {
- strcat(whitelist, "\n");
+ strcat(target_list, "\n");
} return 1;
case 'r': {
- strcat(whitelist, "\r");
+ strcat(target_list, "\r");
} return 1;
case 'b': {
- strcat(whitelist, "\b");
+ strcat(target_list, "\b");
} return 1;
case '[': {
- strcat(whitelist, "[");
+ strcat(target_list, "[");
} return 1;
case ']': {
- strcat(whitelist, "]");
+ strcat(target_list, "]");
} return 1;
case '.': {
- strcat(whitelist, ".");
+ strcat(target_list, ".");
} return 1;
case '=': {
- strcat(whitelist, "=");
+ strcat(target_list, "=");
} return 1;
case '?': {
- strcat(whitelist, "?");
+ strcat(target_list, "?");
} return 1;
case '+': {
- strcat(whitelist, "+");
+ strcat(target_list, "+");
} return 1;
case '*': {
- strcat(whitelist, "*");
+ strcat(target_list, "*");
} return 1;
case '\\': {
- strcat(whitelist, "\\");
+ strcat(target_list, "\\");
} return 1;
}
return 0;
}
-static int escape_1_to_N(const char c, char * whitelist) {
+static int escape_1_to_N(const char c, compiler_state * cs) {
+ char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
switch(c) {
case 'i': {
const char identifier_chars[] = "@0123456789_"
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, identifier_chars);
+ strcpy(target_list, identifier_chars);
return sizeof(identifier_chars)-1;
};
case 'I': {
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, identifier_chars);
+ strcpy(target_list, identifier_chars);
return sizeof(identifier_chars)-1;
};
case 'k': {
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, keyword_chars);
+ strcpy(target_list, keyword_chars);
return sizeof(keyword_chars)-1;
};
case 'K': {
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, keyword_chars);
+ strcpy(target_list, keyword_chars);
return sizeof(keyword_chars)-1;
};
case 'f': {
const char filename_chars[] = "@0123456789/.-_+,#$%~=";
- strcpy(whitelist, filename_chars);
+ strcpy(target_list, filename_chars);
return sizeof(filename_chars)-1;
};
case 'F': {
const char filename_chars[] = "@/.-_+,#$%~=";
- strcpy(whitelist, filename_chars);
+ strcpy(target_list, filename_chars);
return sizeof(filename_chars)-1;
};
case 'p': {
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, printable_chars);
+ strcpy(target_list, printable_chars);
return sizeof(printable_chars)-1;
};
case 'P': {
"\324\325\326\327\330"
"\331\332\333\334\335"
"\336\337";
- strcpy(whitelist, printable_chars);
+ strcpy(target_list, printable_chars);
return sizeof(printable_chars)-1;
};
case 's': {
const char whitespace_chars[] = " \t\v\n";
- strcpy(whitelist, whitespace_chars);
+ strcpy(target_list, whitespace_chars);
return sizeof(whitespace_chars)-1;
};
case 'd': {
const char digit_chars[] = "0123456789";
- strcpy(whitelist, digit_chars);
+ strcpy(target_list, digit_chars);
return sizeof(digit_chars)-1;
};
case 'x': {
const char hex_chars[] = "0123456789"
"abcdef"
"ABCDEF";
- strcpy(whitelist, hex_chars);
+ strcpy(target_list, hex_chars);
return sizeof(hex_chars)-1;
};
case 'o': {
const char oct_chars[] = "01234567";
- strcpy(whitelist, oct_chars);
+ strcpy(target_list, oct_chars);
return sizeof(oct_chars)-1;
};
case 'w': {
"abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
"_";
- strcpy(whitelist, word_chars);
+ strcpy(target_list, word_chars);
return sizeof(word_chars)-1;
};
case 'h': {
const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
"_";
- strcpy(whitelist, very_word_chars);
+ strcpy(target_list, very_word_chars);
return sizeof(very_word_chars)-1;
};
case 'a': {
const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, alpha_chars);
+ strcpy(target_list, alpha_chars);
return sizeof(alpha_chars)-1;
};
case 'l': {
const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
- strcpy(whitelist, lower_alpha_chars);
+ strcpy(target_list, lower_alpha_chars);
return sizeof(lower_alpha_chars)-1;
};
case 'u': {
const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, upper_alpha_chars);
+ strcpy(target_list, upper_alpha_chars);
return sizeof(upper_alpha_chars)-1;
};
}
for (; *s != ']'; s++) {
assert((*s != '\0') && "Unclosed range.");
char c = *s;
- if (escape_1_to_1(c, whitelist)
- || escape_1_to_N(c, whitelist)) {
- ;
+ if (c == '\\') {
+ s += 1;
+ assert(compile_escape(*s, cs) && "Unknown escape.");
} else if (*(s+1) == '-') {
char end = *(s+2);
assert((c < end) && "Endless range.");
for (char cc = c; cc < end+1; cc++) {
- strncat(whitelist, &cc, 1);
- strncat(whitelist, "\0", 1);
+ strncat(target_list, &cc, 1);
+ strncat(target_list, "\0", 1);
}
s += 2;
} else {
- strncat(whitelist, &c, 1);
- strncat(whitelist, "\00", 1);
+ strncat(target_list, &c, 1);
}
}
return ((s - range) + 1);
}
-#define HALT_AND_CATCH_FIRE -1
+/* Append to `filtered` every character of `blacklist` that does not also
+ * occur in `whitelist`.
+ *
+ * whitelist - NUL-terminated set of accepted characters.
+ * blacklist - NUL-terminated set of rejected characters.
+ * filtered  - NUL-terminated output string; characters are appended to
+ *             whatever it already holds, so callers pass an empty string to
+ *             get just the difference. The caller must provide room for up
+ *             to strlen(blacklist) extra characters plus the terminator.
+ */
+void filter_blacklist(const char * const whitelist,
+                      const char * const blacklist,
+                            char * const filtered) {
+	for (const char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
+		// A character that is explicitly whitelisted must stay accepted,
+		// so it is left out of the rejecting set.
+		if (strchr(whitelist, *black_pointer) != NULL) {
+			continue;
+		}
+		strncat(filtered, black_pointer, 1);
+	}
+}
-void HOOK_ALL(int from,
- const char * const str,
- int to,
- compiler_state * cs) {
+#define HALT_AND_CATCH_FIRE INT_MIN
- int hook_to = (*cs->is_negative) ? HALT_AND_CATCH_FIRE : *cs->state + to;
+void HOOK_ALL( int from,
+ const char * const str,
+ int to,
+ compiler_state * cs) {
+
+ int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to);
for (const char * s = str; *s != '\0'; s++) {
vector_push(&cs->regex->delta_table,
&delta);
}
- if (*cs->do_catch || *cs->is_negative) {
- offshoot_t * offshoot = malloc(sizeof(offshoot_t));
- offshoot->in = *cs->state + from;
- offshoot->to = hook_to;
- vector_push(&cs->regex->catch_table,
- &offshoot);
- }
}
-#define EAT(n) do { \
- s += n; \
-} while (0)
+/* Record one entry in the regex's catch table, relative to the current
+ * compiler state.
+ *
+ * from - offset added to `*cs->state` to form the entry's `in` field.
+ * to   - offset added to `*cs->state` to form the entry's `to` field.
+ * cs   - compiler state; supplies the current state number and the regex
+ *        whose `catch_table` receives the new entry.
+ *
+ * NOTE(review): the malloc result is used unchecked, and nothing visible
+ * here frees the node -- presumably regex teardown owns it; confirm.
+ */
+void OFFSHOOT(int from,
+              int to,
+              compiler_state * cs) {
+	const int base_state = *cs->state;
+	offshoot_t * node = malloc(sizeof *node);
+	node->in = base_state + from;
+	node->to = base_state + to;
+	vector_push(&cs->regex->catch_table, &node);
+}
regex_t * regex_compile(const char * const pattern) {
regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
bool is_negative;
int width;
char whitelist[64];
+ char blacklist[64];
compiler_state cs = {
.do_catch = &do_catch,
.state = &state,
.width = &width,
.whitelist = whitelist,
+ .blacklist = blacklist,
.regex = regex,
};
for (const char * s = pattern; *s != '\00';) {
- // Get token
+ // Reset the compiler
assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
whitelist[0] = '\00';
+ blacklist[0] = '\00';
do_catch = false;
is_negative = false;
width = 1;
+ // Translate char
switch (*s) {
case '.': {
- do_catch = true;
+ compile_dot(&cs);
} break;
case '\\': {
- //if (compile_hologram(*s, whitelist)) {
- // break;
- //}
- EAT(1);
- if(escape_1_to_1(*s, whitelist)
- || escape_1_to_N(*s, whitelist)){
- ;
- } else {
- assert(!"Unknown escape.");
- }
+ s += 1;
+ assert(compile_escape(*s, &cs) && "Unknown escape.");
} break;
case '[': {
- EAT(compile_range(s, whitelist, &is_negative)-1);
+ s += compile_range(s, &cs) - 1;
} break;
default: {
whitelist[0] = *s;
} break;
}
- EAT(1);
+ s += 1;
- // Get quantifier
+ // Compile with quantifier
switch (*s) {
case '=':
case '?': {
HOOK_ALL(0, whitelist, +1, &cs);
- EAT(1);
+ if (do_catch || is_negative) {
+ OFFSHOOT(0, +1, &cs);
+ }
+ s += 1;
} break;
case '*': {
HOOK_ALL(0, whitelist, 0, &cs);
- EAT(1);
+ if (do_catch) {
+ OFFSHOOT(0, +1, &cs);
+ } else if (is_negative) {
+ OFFSHOOT(0, 0, &cs);
+ }
+ s += 1;
} break;
case '+': {
HOOK_ALL(0, whitelist, +1, &cs);
+ if (do_catch || is_negative) {
+ OFFSHOOT(0, +1, &cs);
+ }
state += 1;
HOOK_ALL(0, whitelist, 0, &cs);
- EAT(1);
+ if (do_catch || is_negative) {
+ OFFSHOOT(0, 0, &cs);
+ }
+ s += 1;
} break;
default: { // Literal
HOOK_ALL(0, whitelist, +1, &cs);
+ if (do_catch || is_negative) {
+ OFFSHOOT(0, +1, &cs);
+ }
state += 1;
} break;
}
+
+ // Compile blacklist
+ if (*blacklist) {
+ char filtered_blacklist[64];
+ filtered_blacklist[0] = '\0';
+ filter_blacklist(whitelist, blacklist, filtered_blacklist);
+ HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
+ }
}
regex->accepting_state = state;