#include <string.h>
#include <limits.h>
#include <stdlib.h>
+#if DEBUG
+# include <stdio.h>
+#endif
-#define JEGER_INIT_STATE 2
+#define JEGER_SOS_STATE 0 // entry state; equals the initial_state the match loop picks when is_start_of_string holds and scanning is at the first character
+#define JEGER_NSOS_STATE 1 // entry state for every other starting point (presumably short for not-start-of-string); hooked to HALT_AND_CATCH_FIRE when FORCE_START_OF_STRING is set
+#define JEGER_INIT_STATE 2 // state the two entry states are hooked to by the init-state hookups unless a flag redirects or suppresses the hookup
// ------------------
// ### Char tests ###
;
}
+// -------------------
+// ### Match tests ###
+// -------------------
+// Tells whether `match` is the sentinel entry: a match counts as the
+// sentinel only when both its position and its width hold the value -1.
+bool is_sentinel(const match_t * const match) {
+    // A real position rules the sentinel out immediately.
+    if (match->position != -1) {
+        return false;
+    }
+    return match->width == -1;
+}
+
// -----------------
// ### Char sets ###
// -----------------
#define JEGER_CHAR_SET_lower_hex "abcdef"
#define JEGER_CHAR_SET_upper_hex "ABCDEF"
#define JEGER_CHAR_SET_oct_241_to_277 \
- "\241\242\243\244\245" \
- "\246\247\250\251\252" \
- "\253\254\255\256\257" \
- "\260\261\262\263\264" \
- "\265\266\267\270\271" \
- "\272\273\274\275\276" \
- "\277"
+ "\241\242\243\244\245" \
+ "\246\247\250\251\252" \
+ "\253\254\255\256\257" \
+ "\260\261\262\263\264" \
+ "\265\266\267\270\271" \
+ "\272\273\274\275\276" \
+ "\277"
#define JEGER_CHAR_SET_oct_300_to_337 \
"\300\301\302\303\304" \
"\305\306\307\310\311" \
"\331\332\333\334\335" \
"\336\337"
#define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
-#define JEGER_CHAR_SET_whitespace " \t\v\n"
+#define JEGER_CHAR_SET_whitespace " " "\t\v\n"
-static const char JEGER_CHAR_very_word_chars[] =
- JEGER_CHAR_SET_underscore
- JEGER_CHAR_SET_lower
- JEGER_CHAR_SET_upper
- ;
+static const char JEGER_CHAR_symbol_chars[] = // renamed from JEGER_CHAR_very_word_chars; contents unchanged
+ JEGER_CHAR_SET_underscore // the three sets below are pasted together into a single string
+ JEGER_CHAR_SET_lower // (presumably string-literal macros like the other JEGER_CHAR_SET_* defines; verify)
+ JEGER_CHAR_SET_upper
+ ; // copied out by the h escape and appended to the whitelist or blacklist by the < and > cases
// ----------------------
// ### Internal Types ###
} offshoot_t;
enum { // single-bit flags that are OR-ed into compiler_state.flags
- DO_CATCH = 0x00000001 << 0,
- IS_NEGATIVE = 0x00000001 << 1,
- IS_AT_THE_BEGINNING = 0x00000001 << 2,
- FORCE_START_OF_STRING = 0x00000001 << 3,
- INCREMENT_STATE = 0x00000001 << 4,
+ DO_CATCH = 0x00000001 << 0, // NOTE(review): where this gets set is not visible here; the < case only tests it
+ IS_NEGATIVE = 0x00000001 << 1, // set by the < and > cases, which also append the symbol chars to the blacklist
+ IS_AT_THE_BEGINNING = 0x00000001 << 2, // kept by the flag reset that precedes char translation; tested by the < case
+ FORCE_START_OF_STRING = 0x00000001 << 3, // kept by the same reset; when set, JEGER_NSOS_STATE is hooked to HALT_AND_CATCH_FIRE
+ DO_FORBID_START_OF_STRING = 0x00000001 << 4, // new flag: when set, the default JEGER_SOS_STATE -> JEGER_INIT_STATE hookup is skipped
+ INCREMENT_STATE = 0x00000001 << 5, // moved from bit 4 to bit 5 to make room for the new flag
};
typedef struct { // mutable state threaded through the pattern compiler
int flags; // bitwise OR of the flag enum constants (DO_CATCH, IS_NEGATIVE, ...)
int state; // current state number; its final value is stored as regex->accepting_state
int width; // copied into a delta's pattern_width; set back to 1 before each char is translated
+ int width2; // copied into a delta's match_width (which used to be hard-coded to 1); also set back to 1 per char
char * whitelist; // NOTE(review): presumably the NUL-terminated buffer the compile loop fills with strcat; confirm
char * blacklist; // NOTE(review): presumably the matching buffer of excluded chars; confirm
} compiler_state;
.input = *s,
.to = ASSERT_HALT(to),
.pattern_width = cs->width,
- .match_width = 1,
+ .match_width = cs->width2,
};
vector_push(®ex->delta_table,
&delta);
return sizeof(word_chars)-1;
};
case 'h': {
- // #global JEGER_CHAR_very_word_chars
- strcpy(target_list, JEGER_CHAR_very_word_chars);
- return sizeof(JEGER_CHAR_very_word_chars)-1;
+ // #global JEGER_CHAR_symbol_chars
+ strcpy(target_list, JEGER_CHAR_symbol_chars);
+ return sizeof(JEGER_CHAR_symbol_chars)-1;
};
case 'a': {
const char alpha_chars[] = JEGER_CHAR_SET_lower
static inline
int escape_to_negative(const char c,
- compiler_state * const cs) {
+ compiler_state * const cs) {
switch (c) {
case 'D': {
const char digit_chars[] = JEGER_CHAR_SET_digits;
blacklist[0] = '\0';
cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
cs.width = 1;
+ cs.width2 = 1;
// Translate char
switch (*s) {
if (compile_escape(*s, &cs)) {
s += 1;
} else if (is_hologram_escape(*s)) {
- ;
+ s -= 1;
} else {
assert("Unknown escape.");
}
} break;
}
+ /* Ew */
+ if (*s == '\\'
+ && is_hologram_escape(*(s+1))) {
+ ++s;
+ }
+
// Compile char
switch (*s) {
// holograms
s += 1;
} break;
case '<': {
- cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
- if (cs.flags & IS_AT_THE_BEGINNING) {
- ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
+ // XXX: make this legible
+ if (cs.flags & IS_AT_THE_BEGINNING
+ && !(cs.flags & DO_CATCH)
+ && !(cs.flags & IS_NEGATIVE)
+ && whitelist[0] == '\0') {
+ // ---
+ cs.flags |= INCREMENT_STATE;
+ cs.flags |= DO_FORBID_START_OF_STRING;
+ strcat(whitelist, JEGER_CHAR_symbol_chars);
+ // ---
+ ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex);
+ ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex);
+ HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex);
+ // ---
+ ++cs.state;
+ cs.width = 0;
+ cs.width2 = 0;
+ HOOK_ALL(0, whitelist, +1, &cs, regex);
+ cs.width = 1;
+ OFFSHOOT(0, +1, 1, 0, &cs, regex);
+ // ---
+ } else {
+ HOOK_ALL(0, whitelist, +1, &cs, regex);
+ if ((cs.flags & DO_CATCH)
+ || (cs.flags & IS_NEGATIVE)) {
+ OFFSHOOT(+1, +2, 1, 1, &cs, regex);
+ } else {
+ cs.flags |= INCREMENT_STATE;
+ }
+ OFFSHOOT(0, +1, 1, 0, &cs, regex);
}
- strcat(blacklist, JEGER_CHAR_very_word_chars);
- OFFSHOOT(0, 0, 1, 0, &cs, regex);
+ cs.flags |= IS_NEGATIVE;
+ strcat(blacklist, JEGER_CHAR_symbol_chars);
s += 1;
} break;
case '>': {
+ HOOK_ALL(0, whitelist, +1, &cs, regex);
cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
- strcat(blacklist, JEGER_CHAR_very_word_chars);
- OFFSHOOT(0, 1, 0, 0, &cs, regex);
+ strcat(blacklist, JEGER_CHAR_symbol_chars);
+ OFFSHOOT(+1, +2, 0, 0, &cs, regex);
+ ++cs.state;
s += 1;
} break;
// quantifiers
}
// Init state hookups
- ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex);
+ if (!(cs.flags & DO_FORBID_START_OF_STRING)) {
+ ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
+ }
if (cs.flags & FORCE_START_OF_STRING) {
- ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex);
+ ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex);
} else {
- ABSOLUTE_OFFSHOOT(1, JEGER_INIT_STATE, 0, 0, regex);
+ ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
}
regex->accepting_state = cs.state;
if ((delta->in == state)
&& (delta->input == *s)) {
+ bool do_reset = false;
was_found = true;
+ if (!match->_pos_ptr && delta->match_width) {
+ match->_pos_ptr = s;
+ do_reset = true;
+ }
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
if(r){
- if (match->position == -1) {
- match->position = (s - string);
- }
match->width += delta->match_width;
return r;
+ } else if (do_reset) {
+ match->_pos_ptr = NULL;
}
}
}
// Find all matches
{
const char * s = string;
+ int initial_state;
do {
- int initial_state;
initial_state = (int)(!(is_start_of_string && (s == string)));
*match = (match_t){
- .position = -1,
- .width = 0,
+ ._pos_ptr = NULL,
+ .width = 0,
};
if (regex_assert(regex, s, initial_state, match)) {
- match->position = (s - string);
+ if (match->_pos_ptr) {
+ match->position = (match->_pos_ptr - string);
+ } else {
+ match->position = (s - string);
+ }
vector_push(&matches, match);
const char * const string) {
match_t * m = regex_match(regex, string, true);
- const bool r = (m->position != -1);
+ const bool r = !is_sentinel(m);
free(m);
return r;