-#include "regex.h"
+#include <assert.h>
+#include <string.h>
+#include "vector.h"
-bool is_case_on = true;
+typedef struct { // one character-triggered transition of the compiled automaton
+ int in; // state the transition leaves from
+ char input; // character that triggers the transition
+ int to; // state the transition leads to
+} delta_t;
-static bool is_next_valid(const char * const s) {
- return *(s + 1);
-}
+typedef struct { // input-independent fallback transition, consulted by catch_()
+ int in; // state the fallback applies to
+ int to; // state to switch to when the fallback fires
+} offshoot_t;
-static bool char_in_range(const char start,
- const char end,
- const char character) {
- if (start > end){
- return false;
- }
+typedef struct { // a compiled pattern as produced by regex_compile()
+ char * str; // copy of the pattern text (strdup'd in regex_compile, freed in regex_free)
+ vector_t delta_table; // <delta_t> character-triggered transitions
+ vector_t catch_table; // <offshoot_t> fallback transitions (pushed when do_catch is set, i.e. for '.')
+ int accepting_state; // state that must be current at end of input for a match
+} regex_t;
+
+#define HALT_AND_CATCH_FIRE -1 // NOTE(review): defined but not referenced anywhere in the visible code
- for (char c = start; c != end; c++) {
- if (character == c) {
+#define HOOK_ALL(from, str, to) do { \
+ for (char * s = str; *s != '\00'; s++) { \
+ vector_push(regex->delta_table \
+ (delta_t *){state + from, *s, state + to} \
+ ); \
+ } \
+ if (do_catch) { \
+ vector_push(regex->catch_table \
+ (offshoot_t *){state + from, state + to} \
+ ); \
+ } \
+} while (0) /* HOOK_ALL(from, str, to): for every character of `str`, push a
+ * delta_t going from (state + from) to (state + to) on that character; when
+ * `do_catch` is set, additionally push one offshoot_t for the same pair of
+ * states. The expansion relies on `regex`, `state` and `do_catch` being in
+ * scope at the call site. The macro's own loop variable `s` shadows any `s`
+ * of the enclosing scope for the duration of the loop.
+ * NOTE(review): there is no comma between the table argument and the
+ * compound literal in either vector_push call - confirm against vector.h.
+ */
+/* EAT(n): advance the cursor `s` of the enclosing scope by n characters. */
+#define EAT(n) do { \
+ s += n; \
+} while (0)
+/* Return true when c is one of the quantifier characters '+', '*' or '?'; false otherwise. */
+bool is_quantifier(const char c){
+ for (const char * s = "+*?"; *s != '\00'; s++) {
+ if (*s == c) {
return true;
}
}
-
return false;
}
-static bool is_word_separator(const char character) {
- return (( isascii(character))
- && (!isalnum(character))
- && ( character != '_'));
+/* Resolve an escape letter that stands for exactly one character.
+ *
+ * c         - the character that followed the backslash in the pattern.
+ * whitelist - NUL-terminated accumulator; the resolved character is appended
+ *             with strcat, so the caller must leave room for one more byte.
+ *
+ * Returns 1 when c was recognised and a character was appended, 0 when c is
+ * not handled here (whitelist is then left untouched).
+ * t, n, r and b map to tab, newline, carriage return and backspace; the
+ * remaining cases append the character itself.
+ */
+int escape_1_to_1(const char c, char * whitelist) {
+ switch(c) {
+ case 't': {
+ strcat(whitelist, "\t");
+ } return 1;
+ case 'n': {
+ strcat(whitelist, "\n");
+ } return 1;
+ case 'r': {
+ strcat(whitelist, "\r");
+ } return 1;
+ case 'b': {
+ strcat(whitelist, "\b");
+ } return 1;
+ case '[': {
+ strcat(whitelist, "[");
+ } return 1;
+ case ']': {
+ strcat(whitelist, "]");
+ } return 1;
+ case '.': {
+ strcat(whitelist, ".");
+ } return 1;
+ case '?': {
+ strcat(whitelist, "?");
+ } return 1;
+ case '+': {
+ strcat(whitelist, "+");
+ } return 1;
+ case '*': {
+ strcat(whitelist, "*");
+ } return 1;
+ case '\\': {
+ strcat(whitelist, "\\");
+ } return 1;
+ }
+
+ return 0;
}
-static bool magic(const char magic_char, const char to_enchant) {
- switch(magic_char){
- // \i identifier character (see 'isident' option)
- // \I like "\i", but excluding digits
- // \k keyword character (see 'iskeyword' option)
- // \K like "\k", but excluding digits
- // \f file name character (see 'isfname' option)
- // \F like "\f", but excluding digits
- // \p printable character (see 'isprint' option)
- // \P like "\p", but excluding digits
- case 's': {
- return ((to_enchant == ' ') || (to_enchant == '\t'));
- }
- case 'S': {
- return !((to_enchant == ' ') || (to_enchant == '\t'));
- }
- case 'd': { // [0-9]
- return char_in_range('0', '9', to_enchant);
+int escape_1_to_N(const char c, char * whitelist) { /* Resolve an escape
+ * letter that stands for a whole class of characters.
+ *
+ * c         - the class letter.
+ * whitelist - destination buffer; it is OVERWRITTEN with strcpy (it is not
+ *             appended to), so anything accumulated before the call is lost.
+ *
+ * Returns the number of characters copied (sizeof of the list minus the
+ * terminator), or 0 when c is not a known class letter.
+ */
+ switch(c) {
+ case 'i': {
+ const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, identifier_chars);
+ return sizeof(identifier_chars)-1;
+ };
+ case 'I': {
+ const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, identifier_chars);
+ return sizeof(identifier_chars)-1;
};
- case 'D': { // [^0-9]
- return !char_in_range('0', '9', to_enchant);
+ case 'k': {
+ const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, keyword_chars);
+ return sizeof(keyword_chars)-1;
};
- case 'x': { // [0-9A-Fa-f]
- return char_in_range('0', '9', to_enchant) || char_in_range('A', 'F', to_enchant) || char_in_range('a', 'f', to_enchant);
+ case 'K': {
+ const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, keyword_chars);
+ return sizeof(keyword_chars)-1;
};
- case 'X': { // [^0-9A-Fa-f]
- return !char_in_range('0', '9', to_enchant) && !char_in_range('A', 'F', to_enchant) && !char_in_range('a', 'f', to_enchant);
+ case 'f': {
+ const char filename_chars[] = "@0123456789/.-_+,#$%~=";
+ strcpy(whitelist, keyword_chars); // NOTE(review): keyword_chars is not declared in this case block; filename_chars looks intended
+ return sizeof(keyword_chars)-1; // NOTE(review): same undeclared name as the line above
};
- case 'o': { // [0-7]
- return char_in_range('0', '7', to_enchant);
+ case 'F': {
+ const char filename_chars[] = "@/.-_+,#$%~=";
+ strcpy(whitelist, keyword_chars); // NOTE(review): keyword_chars is not declared in this case block; filename_chars looks intended
+ return sizeof(keyword_chars)-1; // NOTE(review): same undeclared name as the line above
};
- case 'O': { // [^0-7]
- return !char_in_range('0', '7', to_enchant);
+ case 'p': {
+ const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, printable_chars);
+ return sizeof(printable_chars)-1;
};
- case 'w': { // [0-9A-Za-z_]
- return char_in_range('0', '9', to_enchant) || char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_');
+ case 'P': { // NOTE(review): this list is identical to the one used for 'p'
+ const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
+ strcpy(whitelist, printable_chars);
+ return sizeof(printable_chars)-1;
};
- case 'W': { // [^0-9A-Za-z_]
- return !(char_in_range('0', '9', to_enchant) || char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_'));
+ case 's': { // space, tab, vertical tab, newline
+ const char whitespace_chars[] = " \t\v\n";
+ strcpy(whitelist, whitespace_chars);
+ return sizeof(whitespace_chars)-1;
};
- case 'h': { // [A-Za-z_]
- return char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_');
+ case 'd': {
+ const char digit_chars[] = "0123456789";
+ strcpy(whitelist, digit_chars);
+ return sizeof(digit_chars)-1;
};
- case 'H': { // [^A-Za-z_]
- return !(char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_'));
+ case 'x': {
+ const char hex_chars[] = "0123456789abcdefABCDEF";
+ strcpy(whitelist, hex_chars);
+ return sizeof(hex_chars)-1;
};
- case 'a': { // [A-Za-z]
- return char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant);
+ case 'o': {
+ const char oct_chars[] = "01234567";
+ strcpy(whitelist, oct_chars);
+ return sizeof(oct_chars)-1;
};
- case 'A': { // [A-Za-z]
- return !(char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant));
+ case 'w': {
+ const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; // NOTE(review): 'v' and 'V' are absent from this list
+ strcpy(whitelist, word_chars);
+ return sizeof(word_chars)-1;
};
- case 'l': { // [a-z]
- return char_in_range('a', 'z', to_enchant);
+ case 'h': {
+ const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; // NOTE(review): 'v' and 'V' are absent from this list
+ strcpy(whitelist, very_word_chars);
+ return sizeof(very_word_chars)-1;
};
- case 'L': { // [^a-z]
- return !(char_in_range('a', 'z', to_enchant));
+ case 'a': {
+ const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ"; // NOTE(review): 'v' and 'V' are absent from this list
+ strcpy(whitelist, alpha_chars);
+ return sizeof(alpha_chars)-1;
};
- case 'u': { // [A-Z]
- return char_in_range('A', 'Z', to_enchant);
+ case 'l': {
+ const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; // NOTE(review): 'v' is absent from this list
+ strcpy(whitelist, lower_alpha_chars);
+ return sizeof(lower_alpha_chars)-1;
};
- case 'U': { // [^A-Z]
- return !(char_in_range('A', 'Z', to_enchant));
+ case 'u': {
+ const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; // NOTE(review): 'V' is absent from this list
+ strcpy(whitelist, upper_alpha_chars);
+ return sizeof(upper_alpha_chars)-1;
};
}
- return false;
+ return 0;
}
-int regex_match(const char * const pattern,
- const char * const string_start,
- const int string_offset,
- int * match_offset_) {
- const char * pattern_pointer = pattern;
- const char * string_pointer = string_start + string_offset;
- const char * const match_base = string_pointer;
- int match_offset = 0;
-
- while (1488) {
- // End of one of the arguments
- if (!(*pattern_pointer)) {
- break;
- }
- if (!(*string_pointer)) {
- return false;
- }
+int compile_range(const char * const range,
+ char * whitelist) { /* Expand a bracket expression into the set of
+ * characters it accepts.
+ *
+ * range     - points at the opening '[' of the expression (asserted).
+ * whitelist - NUL-terminated accumulator that receives the characters.
+ *
+ * Returns the number of pattern characters the expression occupies,
+ * counting both brackets. An unterminated expression trips the
+ * "Unclosed range." assertion.
+ */
+ assert(range[0] == '[' && "Not a range.");
- // Escape character
- if (*pattern_pointer == '\\') {
- if (!is_next_valid(pattern_pointer)) {
- return false;
+ int r = 0; // counts literal members; incremented below but never read
+ const char * s;
+ for (s = range+1; *s != ']'; s++) {
+ assert(*s != '\00' && "Unclosed range.");
+ char c = *s;
+ if (escape_1_to_1(c, whitelist) // NOTE(review): tried on every character, with no check for a preceding backslash
+ || escape_1_to_N(c, whitelist)) { // NOTE(review): escape_1_to_N overwrites whitelist via strcpy
+ ;
+ } else if (*(s+1) == '-') { // "c-end": add every character from c to end inclusive
+ char end = *(s+2);
+ assert(c < end && "Endless range.");
+ for (char cc = c; cc < end+1; cc++) {
+ strncat(whitelist, &cc, 1);
+ strncat(whitelist, "\00", 1); // appends nothing: the source is an empty string and strncat already terminates
}
+ s += 2; // skip the '-' and the end character
+ } else { // plain literal member
+ ++r;
+ strncat(whitelist, &c, 1);
+ strncat(whitelist, "\00", 1); // appends nothing: the source is an empty string and strncat already terminates
+ }
+ }
- switch(*(pattern_pointer + 1)){
- case 't': {
- if (*(string_pointer + 1) == '\t') {
- pattern_pointer += 2;
- string_pointer += 1;
- } else {
- return false;
- }
- } break;
- case 'r': {
- if (*(string_pointer + 1) == '\r') {
- pattern_pointer += 2;
- string_pointer += 1;
- } else {
- return false;
- }
- } break;
- case 'e': {
- if (*(string_pointer + 1) == '\033') {
- pattern_pointer += 2;
- string_pointer += 1;
- } else {
- return false;
- }
- } break;
- case 'b': {
- if (*(string_pointer + 1) == '\010') {
- pattern_pointer += 2;
- string_pointer += 1;
- } else {
- return false;
- }
- } break;
- }
+ return ((s - range) + 1); // s rests on the closing ']' here
+}
- if (*(pattern_pointer + 1) == '\\') {
- if (*string_pointer == '\\') {
- pattern_pointer += 2;
- string_pointer += 1;
- continue;
- }
- }
+regex_t * regex_compile(const char * const pattern) { /* Translate a pattern
+ * string into a regex_t: a table of character transitions, a table of
+ * fallback transitions, and an accepting state.
+ *
+ * The pattern is consumed token by token. Each token is first reduced to a
+ * `whitelist` of accepted characters (or to do_catch for '.'); the
+ * character that follows decides which transitions HOOK_ALL emits.
+ *
+ * Returns the newly allocated regex_t; release its contents with
+ * regex_free().
+ */
+ regex_t * r = new regex_t; // NOTE(review): `new` is C++; this file otherwise reads as C
+ regex->str = strdup(pattern); // NOTE(review): `regex` is not declared in this function (the local is `r`); HOOK_ALL expects `regex` as well
+ vector_init(regex->delta_table, sizeof(delta_t), 32);
+ vector_init(regex->catch_table, sizeof(offshoot_t), 16);
- if (*(pattern_pointer + 1) == '<') {
- if (is_word_separator(*string_pointer)) {
- pattern_pointer += 2;
- string_pointer += 1;
- match_offset += 1;
- continue;
- } else if (string_pointer == string_start) {
- pattern_pointer += 2;
- continue;
- }
- }
+ int state = 0; // state the next token will be attached to
- if (*(pattern_pointer + 1) == '>') {
- if (is_word_separator(*string_pointer)) {
- pattern_pointer += 2;
- continue;
- }
- if (*(string_pointer + 1) == '\00') {
- break;
+ char whitelist[64]; // NOTE(review): the \p list in escape_1_to_N is 64 characters plus terminator - looks one byte too large for this buffer, confirm
+ bool do_catch;
+ for (const char * s = pattern; *s != '\00';) {
+ // Get token
+ assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); // inspects the first pattern character on every iteration
+ whitelist[0] = '\00';
+ do_catch = false;
+ switch (*s) {
+ case '.': { // leaves whitelist empty; only a fallback transition is emitted
+ do_catch = true;
+ } break;
+ case '\\': {
+ EAT(1);
+ if(escape_1_to_1(*s, whitelist)
+ || escape_1_to_N(*s, whitelist)){
+ ;
+ } else {
+ assert(!"Unknown escape.");
}
- }
+ } break;
+ case '[': {
+ EAT(compile_range(s, whitelist)-1); // leave s on the closing ']'; the EAT(1) below steps past it
+ } break;
+ default: { // ordinary character: accept exactly that character
+ whitelist[0] = *s;
+ whitelist[1] = '\00';
+ } break;
+ }
- if (magic(*(pattern_pointer + 1), *string_pointer)) {
- pattern_pointer += 2;
- string_pointer += 1;
- continue;
- }
+ EAT(1);
- return false;
+ // Quantifier
+ switch (*s) {
+ case '?': { // transitions to state+1 are added, but `state` itself is not advanced
+ HOOK_ALL(0, whitelist, +1);
+ EAT(1);
+ } break;
+ case '*': { // self-loop on the current state
+ HOOK_ALL(0, whitelist, 0);
+ EAT(1);
+ } break;
+ case '+': { // one mandatory step, then a self-loop on the new state
+ HOOK_ALL(0, whitelist, +1);
+ state += 1;
+ HOOK_ALL(0, whitelist, 0);
+ EAT(1);
+ } break;
+ default: { // Literal
+ HOOK_ALL(0, whitelist, +1);
+ state += 1;
+ } break;
}
+ }
- // Literal
- if (*pattern_pointer != *string_pointer) {
- return false;
- } else {
- ++pattern_pointer;
- ++string_pointer;
+ regex->accepting_state = state; // the last state reached by the compiler is the accepting one
+
+ return r;
+}
+/* Release what a compiled regex owns: the pattern copy (free) and both
+ * transition tables (vector_free). The regex_t object itself is not released
+ * here. Always returns 0.
+ */
+int regex_free(regex_t * const regex) {
+ free(regex->str);
+ vector_free(regex->delta_table);
+ vector_free(regex->catch_table);
+ return 0;
+}
+/* Look for a fallback (catch_table) entry whose `in` equals the current
+ * state. On the first hit, `state` is replaced by that entry's `to` and true
+ * is returned; when no entry applies, `state` is left alone and false is
+ * returned.
+ */
+inline bool catch_(const regex_t * const regex,
+ int & state) { // NOTE(review): `int &` is C++ reference syntax
+
+ for (int i = 0; i < regex->catch_table->element_size; i++){ // NOTE(review): bound is element_size via `->`, while regex_assert iterates with `.element_count` - confirm against vector.h
+ const offshoot_t * const offshoot = (offshoot *)(vector_get(reg.catch_table, i)); // NOTE(review): cast names `offshoot` (the typedef is offshoot_t) and `reg` is not declared in this function
+ if (offshoot->in == state) {
+ state = offshoot->to;
+ return true;
}
}
+ return false;
+}
+/* Recursive matcher: decide whether `string`, read from automaton state
+ * `state`, ends in the accepting state.
+ *
+ * For each input character, every transition leaving the current state on
+ * that character is tried by recursing on the rest of the string; the first
+ * successful branch returns true. If none succeeds, a fallback transition
+ * (catch_) may move `state` forward and consume the character; otherwise the
+ * match fails. Once the input is exhausted, the result is whether `state`
+ * equals regex->accepting_state.
+ */
+bool regex_assert(const regex_t * const regex,
+ const char * const string,
+ int state) {
- if (match_offset_) {
- *match_offset_ = match_offset;
+ for (const char * s = string; *s != '\00'; s++) {
+ // delta
+ for (int i = 0; i < regex->delta_table.element_count; i++) {
+ const delta_t * const delta = (delta_t *)(vector_get(reg.delta_table, i); // NOTE(review): parentheses are unbalanced and `reg` is not declared in this function
+ if (delta->in == state) // NOTE(review): the condition closes here, leaving the `&&` on the next line outside the parentheses
+ && (delta->input == *s)) {
+ if(regex_assert(regex, s+1, delta->to)){
+ return true;
+ }
+ }
+ }
+
+ if (catch_(regex, state)) { // no transition matched: let a fallback consume this character
+ continue;
+ }
+
+ return false;
+ }
+
+ return (state == regex->accepting_state);
+}
+/* Entry point for matching: run the compiled `regex` over `string` starting
+ * in state 0. A NULL regex yields false; a NULL string yields true; otherwise
+ * the result of regex_assert is returned.
+ */
+bool regex_search( regex_t * regex,
+ const char * const string) {
+
+ if (regex == NULL) {
+ return false;
+ }
+ if (string == NULL) {
+ return true;
}
- return (string_pointer - match_base) - match_offset;
+
+ return regex_assert(regex, string, 0);
}
extern bool is_case_on;
-extern int regex_match(const char * const pattern, const char * const string, const int string_offset, int * match_offset_);
+extern regex_t * regex_compile(const char * const pattern); // builds a regex_t from a pattern string
+extern int regex_match(const char * const pattern, const char * const string, const int string_offset, int * match_offset_); // NOTE(review): the definition of regex_match is removed elsewhere in this patch - confirm this declaration should remain
+extern int regex_free(regex_t * const regex); // frees the pattern copy and both tables; always returns 0
+++ /dev/null
-#include <assert.h>
-#include <string.h>
-
-typedef struct {
- int in;
- char input;
- int to;
-} delta_t;
-
-typedef struct {
- int in;
- int to;
-} offshoot_t;
-
-typedef struct {
- char * str;
- std::vector<delta_t> delta_table;
- std::vector<offshoot_t> catch_table;
- int accepting_state;
-} regex_t;
-
-#define HALT_AND_CATCH_FIRE -1
-
-#define HOOK_ALL(from, str, to) do { \
- for (char * s = str; *s != '\00'; s++) { \
- reg.delta_table.push_back( \
- delta_t{state + from, *s, state + to} \
- ); \
- } \
- if (do_catch) { \
- reg.catch_table.push_back( \
- {state + from, state + to} \
- ); \
- } \
-} while (0)
-
-#define EAT(n) do { \
- s += n; \
-} while (0)
-
-bool is_quantifier(const char c){
- for (const char * s = "+*?"; *s != '\00'; s++) {
- if (*s == c) {
- return true;
- }
- }
- return false;
-}
-
-
-int escape_1_to_1(const char c, char * whitelist) {
- switch(c) {
- case 't': {
- strcat(whitelist, "\t");
- } return 1;
- case 'n': {
- strcat(whitelist, "\n");
- } return 1;
- case 'r': {
- strcat(whitelist, "\r");
- } return 1;
- case 'b': {
- strcat(whitelist, "\b");
- } return 1;
- case '[': {
- strcat(whitelist, "[");
- } return 1;
- case ']': {
- strcat(whitelist, "]");
- } return 1;
- case '.': {
- strcat(whitelist, ".");
- } return 1;
- case '?': {
- strcat(whitelist, "?");
- } return 1;
- case '+': {
- strcat(whitelist, "+");
- } return 1;
- case '*': {
- strcat(whitelist, "*");
- } return 1;
- case '\\': {
- strcat(whitelist, "\\");
- } return 1;
- }
-
- return 0;
-}
-
-int escape_1_to_N(const char c, char * whitelist) {
- switch(c) {
- case 'i': {
- const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'I': {
- const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'k': {
- const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'K': {
- const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'f': {
- const char filename_chars[] = "@0123456789/.-_+,#$%~=";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'F': {
- const char filename_chars[] = "@/.-_+,#$%~=";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'p': {
- const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 'P': {
- const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 's': {
- const char whitespace_chars[] = " \t\v\n";
- strcpy(whitelist, whitespace_chars);
- return sizeof(whitespace_chars)-1;
- };
- case 'd': {
- const char digit_chars[] = "0123456789";
- strcpy(whitelist, digit_chars);
- return sizeof(digit_chars)-1;
- };
- case 'x': {
- const char hex_chars[] = "0123456789abcdefABCDEF";
- strcpy(whitelist, hex_chars);
- return sizeof(hex_chars)-1;
- };
- case 'o': {
- const char oct_chars[] = "01234567";
- strcpy(whitelist, oct_chars);
- return sizeof(oct_chars)-1;
- };
- case 'w': {
- const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_";
- strcpy(whitelist, word_chars);
- return sizeof(word_chars)-1;
- };
- case 'h': {
- const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_";
- strcpy(whitelist, very_word_chars);
- return sizeof(very_word_chars)-1;
- };
- case 'a': {
- const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, alpha_chars);
- return sizeof(alpha_chars)-1;
- };
- case 'l': {
- const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
- strcpy(whitelist, lower_alpha_chars);
- return sizeof(lower_alpha_chars)-1;
- };
- case 'u': {
- const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, upper_alpha_chars);
- return sizeof(upper_alpha_chars)-1;
- };
- }
-
- return 0;
-}
-
-int compile_range(const char * const range,
- char * whitelist) {
- assert(range[0] == '[' && "Not a range.");
-
- int r = 0;
- const char * s;
- for (s = range+1; *s != ']'; s++) {
- assert(*s != '\00' && "Unclosed range.");
- char c = *s;
- if (escape_1_to_1(c, whitelist)
- || escape_1_to_N(c, whitelist)) {
- ;
- } else if (*(s+1) == '-') {
- char end = *(s+2);
- assert(c < end && "Endless range.");
- for (char cc = c; cc < end+1; cc++) {
- strncat(whitelist, &cc, 1);
- strncat(whitelist, "\00", 1);
- }
- s += 2;
- } else {
- ++r;
- strncat(whitelist, &c, 1);
- strncat(whitelist, "\00", 1);
- }
- }
-
- return ((s - range) + 1);
-}
-
-regex_t * regex_compile(const char * const pattern) {
- regex_t * r = new regex_t;
- regex_t ® = *r;
- reg.str = strdup(pattern);
-
- int state = 0;
-
- char whitelist[64];
- bool do_catch;
- for (const char * s = pattern; *s != '\00';) {
- // Get token
- assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
- whitelist[0] = '\00';
- do_catch = false;
- switch (*s) {
- case '.': {
- do_catch = true;
- } break;
- case '\\': {
- EAT(1);
- if(escape_1_to_1(*s, whitelist)
- || escape_1_to_N(*s, whitelist)){
- ;
- } else {
- assert(!"Unknown escape.");
- }
- } break;
- case '[': {
- EAT(compile_range(s, whitelist)-1);
- } break;
- default: {
- whitelist[0] = *s;
- whitelist[1] = '\00';
- } break;
- }
-
- EAT(1);
-
- // Quantifier
- switch (*s) {
- case '?': {
- HOOK_ALL(0, whitelist, +1);
- EAT(1);
- } break;
- case '*': {
- HOOK_ALL(0, whitelist, 0);
- EAT(1);
- } break;
- case '+': {
- HOOK_ALL(0, whitelist, +1);
- state += 1;
- HOOK_ALL(0, whitelist, 0);
- EAT(1);
- } break;
- default: { // Literal
- HOOK_ALL(0, whitelist, +1);
- state += 1;
- } break;
- }
- }
-
- reg.accepting_state = state;
-
- return r;
-}
-
-inline bool catch_(const regex_t * regex,
- int & state) {
-
- const regex_t ® = *regex;
- for (int i = 0; i < reg.catch_table.size(); i++){
- if (reg.catch_table[i].in == state) {
- state = reg.catch_table[i].to;
- return true;
- }
- }
- return false;
-}
-
-bool regex_assert(const regex_t * const regex,
- const char * const string,
- int state) {
-
- const regex_t ® = *regex;
- for (const char * s = string; *s != '\00'; s++) {
- // delta
- for (int i = 0; i < reg.delta_table.size(); i++) {
- if ((reg.delta_table[i].in == state)
- && (reg.delta_table[i].input == *s)) {
- if(regex_assert(regex, s+1, reg.delta_table[i].to)){
- return true;
- }
- }
- }
-
- if (catch_(regex, state)) {
- continue;
- }
-
- return false;
- }
-
- return (state == regex->accepting_state);
-}
-
-bool regex_search( regex_t * regex,
- const char * const string) {
-
- if (regex == NULL) {
- return false;
- }
- if (string == NULL) {
- return true;
- }
-
- return regex_assert(regex, string, 0);
-}
+++ /dev/null
-#include <assert.h>
-#include <string.h>
-#include "vector.h"
-
-typedef struct {
- int in;
- char input;
- int to;
-} delta_t;
-
-typedef struct {
- int in;
- int to;
-} offshoot_t;
-
-typedef struct {
- char * str;
- vector_t delta_table; // <delta_t>
- vector_t catch_table; // <offshoot_t>
- int accepting_state;
-} regex_t;
-
-#define HALT_AND_CATCH_FIRE -1
-
-#define HOOK_ALL(from, str, to) do { \
- for (char * s = str; *s != '\00'; s++) { \
- vector_push(®.delta_table \
- (delta_t *){state + from, *s, state + to} \
- ); \
- } \
- if (do_catch) { \
- vector_push(®.catch_table \
- (offshoot_t *){state + from, state + to} \
- ); \
- } \
-} while (0)
-
-#define EAT(n) do { \
- s += n; \
-} while (0)
-
-bool is_quantifier(const char c){
- for (const char * s = "+*?"; *s != '\00'; s++) {
- if (*s == c) {
- return true;
- }
- }
- return false;
-}
-
-
-int escape_1_to_1(const char c, char * whitelist) {
- switch(c) {
- case 't': {
- strcat(whitelist, "\t");
- } return 1;
- case 'n': {
- strcat(whitelist, "\n");
- } return 1;
- case 'r': {
- strcat(whitelist, "\r");
- } return 1;
- case 'b': {
- strcat(whitelist, "\b");
- } return 1;
- case '[': {
- strcat(whitelist, "[");
- } return 1;
- case ']': {
- strcat(whitelist, "]");
- } return 1;
- case '.': {
- strcat(whitelist, ".");
- } return 1;
- case '?': {
- strcat(whitelist, "?");
- } return 1;
- case '+': {
- strcat(whitelist, "+");
- } return 1;
- case '*': {
- strcat(whitelist, "*");
- } return 1;
- case '\\': {
- strcat(whitelist, "\\");
- } return 1;
- }
-
- return 0;
-}
-
-int escape_1_to_N(const char c, char * whitelist) {
- switch(c) {
- case 'i': {
- const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'I': {
- const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, identifier_chars);
- return sizeof(identifier_chars)-1;
- };
- case 'k': {
- const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'K': {
- const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'f': {
- const char filename_chars[] = "@0123456789/.-_+,#$%~=";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'F': {
- const char filename_chars[] = "@/.-_+,#$%~=";
- strcpy(whitelist, keyword_chars);
- return sizeof(keyword_chars)-1;
- };
- case 'p': {
- const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 'P': {
- const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337";
- strcpy(whitelist, printable_chars);
- return sizeof(printable_chars)-1;
- };
- case 's': {
- const char whitespace_chars[] = " \t\v\n";
- strcpy(whitelist, whitespace_chars);
- return sizeof(whitespace_chars)-1;
- };
- case 'd': {
- const char digit_chars[] = "0123456789";
- strcpy(whitelist, digit_chars);
- return sizeof(digit_chars)-1;
- };
- case 'x': {
- const char hex_chars[] = "0123456789abcdefABCDEF";
- strcpy(whitelist, hex_chars);
- return sizeof(hex_chars)-1;
- };
- case 'o': {
- const char oct_chars[] = "01234567";
- strcpy(whitelist, oct_chars);
- return sizeof(oct_chars)-1;
- };
- case 'w': {
- const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_";
- strcpy(whitelist, word_chars);
- return sizeof(word_chars)-1;
- };
- case 'h': {
- const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_";
- strcpy(whitelist, very_word_chars);
- return sizeof(very_word_chars)-1;
- };
- case 'a': {
- const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, alpha_chars);
- return sizeof(alpha_chars)-1;
- };
- case 'l': {
- const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
- strcpy(whitelist, lower_alpha_chars);
- return sizeof(lower_alpha_chars)-1;
- };
- case 'u': {
- const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
- strcpy(whitelist, upper_alpha_chars);
- return sizeof(upper_alpha_chars)-1;
- };
- }
-
- return 0;
-}
-
-int compile_range(const char * const range,
- char * whitelist) {
- assert(range[0] == '[' && "Not a range.");
-
- int r = 0;
- const char * s;
- for (s = range+1; *s != ']'; s++) {
- assert(*s != '\00' && "Unclosed range.");
- char c = *s;
- if (escape_1_to_1(c, whitelist)
- || escape_1_to_N(c, whitelist)) {
- ;
- } else if (*(s+1) == '-') {
- char end = *(s+2);
- assert(c < end && "Endless range.");
- for (char cc = c; cc < end+1; cc++) {
- strncat(whitelist, &cc, 1);
- strncat(whitelist, "\00", 1);
- }
- s += 2;
- } else {
- ++r;
- strncat(whitelist, &c, 1);
- strncat(whitelist, "\00", 1);
- }
- }
-
- return ((s - range) + 1);
-}
-
-regex_t * regex_compile(const char * const pattern) {
- regex_t * r = new regex_t;
- regex_t ® = *r;
- reg.str = strdup(pattern);
-
- int state = 0;
-
- char whitelist[64];
- bool do_catch;
- for (const char * s = pattern; *s != '\00';) {
- // Get token
- assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
- whitelist[0] = '\00';
- do_catch = false;
- switch (*s) {
- case '.': {
- do_catch = true;
- } break;
- case '\\': {
- EAT(1);
- if(escape_1_to_1(*s, whitelist)
- || escape_1_to_N(*s, whitelist)){
- ;
- } else {
- assert(!"Unknown escape.");
- }
- } break;
- case '[': {
- EAT(compile_range(s, whitelist)-1);
- } break;
- default: {
- whitelist[0] = *s;
- whitelist[1] = '\00';
- } break;
- }
-
- EAT(1);
-
- // Quantifier
- switch (*s) {
- case '?': {
- HOOK_ALL(0, whitelist, +1);
- EAT(1);
- } break;
- case '*': {
- HOOK_ALL(0, whitelist, 0);
- EAT(1);
- } break;
- case '+': {
- HOOK_ALL(0, whitelist, +1);
- state += 1;
- HOOK_ALL(0, whitelist, 0);
- EAT(1);
- } break;
- default: { // Literal
- HOOK_ALL(0, whitelist, +1);
- state += 1;
- } break;
- }
- }
-
- reg.accepting_state = state;
-
- return r;
-}
-
-inline bool catch_(const regex_t * regex,
- int & state) {
-
- const regex_t ® = *regex;
- for (int i = 0; i < reg.catch_table.size(); i++){
- if (reg.catch_table[i].in == state) {
- state = reg.catch_table[i].to;
- return true;
- }
- }
- return false;
-}
-
-bool regex_assert(const regex_t * const regex,
- const char * const string,
- int state) {
-
- const regex_t ® = *regex;
- for (const char * s = string; *s != '\00'; s++) {
- // delta
- for (int i = 0; i < reg.delta_table.size(); i++) {
- const delta_t * const delta = (delta_t *)(vector_get(reg.delta_table, i);
- if (delta->in == state)
- && (delta->input == *s)) {
- if(regex_assert(regex, s+1, delta->to)){
- return true;
- }
- }
- }
-
- if (catch_(regex, state)) {
- continue;
- }
-
- return false;
- }
-
- return (state == regex->accepting_state);
-}
-
-bool regex_search( regex_t * regex,
- const char * const string) {
-
- if (regex == NULL) {
- return false;
- }
- if (string == NULL) {
- return true;
- }
-
- return regex_assert(regex, string, 0);
-}