]> git.xolatile.top Git - public-libhl.git/commitdiff
new regex engine
authoranon <anon@anon.anon>
Wed, 23 Aug 2023 16:38:12 +0000 (18:38 +0200)
committeranon <anon@anon.anon>
Wed, 23 Aug 2023 16:38:12 +0000 (18:38 +0200)
source/hl.h
source/regex2.hpp [new file with mode: 0644]
tests/regex_tester.cpp [new file with mode: 0644]

index dcf2894a2f871e14e6c043e765398fbf85bec864..b64a77c0bcafc206a9797a5e6dfcdc785ff57de0 100644 (file)
@@ -48,7 +48,6 @@ int token_table_top = 0;
 // --------------------------------
 // ### Constructors/Destructors ###
 // --------------------------------
-
 void new_display_mode(display_t * mode) {
        HASH_ADD_STR(display_table,
                     key,
diff --git a/source/regex2.hpp b/source/regex2.hpp
new file mode 100644 (file)
index 0000000..25badf3
--- /dev/null
@@ -0,0 +1,230 @@
+#include <vector>
+#include <assert.h>
+#include <string.h>
+
+typedef struct {
+       int in;
+       char input;
+       int to;
+} delta_t;
+
+typedef struct {
+       int in;
+       int to;
+} offshoot_t;
+
+typedef struct {
+       char * str;
+       std::vector<delta_t> delta_table;
+       std::vector<offshoot_t> catch_table;
+       int accepting_state;
+} regex_t;
+
+#define HALT_AND_CATCH_FIRE -1
+
+#define HOOK_ALL(from, str, to) do {                   \
+       for (char * s = str; *s != '\00'; s++) {           \
+               reg.delta_table.push_back(                     \
+                       delta_t{state + from, *s, state + to}      \
+               );                                             \
+       }                                                  \
+       if (do_catch) {                                    \
+               reg.catch_table.push_back(                     \
+                       {state + from, state + to}                 \
+               );                                             \
+       }                                                  \
+} while (0)
+
+#define EAT(n) do { \
+       s += n;         \
+} while (0)
+
+bool is_quantifier(const char c){
+       for (const char * s = "+*?"; *s != '\00'; s++) {
+               if (*s == c) {
+                       return true;
+               }
+       }
+       return false;
+}
+
+
+int escape_1_to_1(const char c, char * whitelist) {
+       switch(c) {
+               case 't': {
+                       strcpy(whitelist, "\t");
+               } return 1;
+               case 'n': {
+                       strcpy(whitelist, "\n");
+               } return 1;
+       }
+
+       return 0;
+}
+
+int escape_1_to_N(const char c, char * whitelist) {
+       switch(c) {
+               case 'd': {
+                       const char digitchars[] = "0123456789";
+                       strcpy(whitelist, digitchars);
+                       return sizeof(digitchars)-1;
+               };
+               case 'w': {
+                       const char wordchars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ";
+                       strcpy(whitelist, wordchars);
+                       return sizeof(wordchars)-1;
+               };
+               case 's': {
+                       const char blankchars[] = " \t\v\n";
+                       strcpy(whitelist, blankchars);
+                       return sizeof(blankchars)-1;
+               };
+       }
+
+       return 0;
+}
+
+int compile_range(const char * const     range,
+                        char *       whitelist) {
+       assert(range[0] == '[' && "Not a range.");
+
+       int r = 0;
+       const char * s;
+       for (s = range+1; *s != ']'; s++) {
+               assert(*s != '\00' && "Unclosed range.");
+               char c = *s;
+               if (*(s+1) == '-') {
+                       char end = *(s+2);
+                       assert(c < end && "Endless range.");
+                       for (char cc = c; cc < end+1; cc++) {
+                               strncat(whitelist,   &cc, 1);
+                               strncat(whitelist, "\00", 1);
+                       }
+                       s += 2;
+               } else {
+                       ++r;
+                       strncat(whitelist,    &c, 1);
+                       strncat(whitelist, "\00", 1);
+               }
+       }
+
+       return ((s - range) + 1);
+}
+
+regex_t * regex_compile(const char * const pattern) {
+       regex_t * r = new regex_t;
+       regex_t &reg = *r;
+       reg.str = strdup(pattern);
+
+       int state = 0;
+
+       char whitelist[64];
+       bool do_catch;
+       for (const char * s = pattern; *s != '\00';) {
+               // Get token
+               assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
+               whitelist[0] = '\00';
+               do_catch     = false;
+               switch (*s) {
+                       case '.': {
+                               do_catch = true;
+                       } break;
+                       case '\\': {
+                               EAT(1);
+                               if(escape_1_to_1(*s, whitelist)
+                               || escape_1_to_N(*s, whitelist)){
+                                       ;
+                               } else {
+                                       assert(!"Unknown escape.");
+                               }
+                       } break;
+                       case '[': {
+                               EAT(compile_range(s, whitelist)-1);
+                       } break;
+                       default: {
+                               whitelist[0] = *s;
+                               whitelist[1] = '\00';
+                       } break;
+               }
+
+               EAT(1);
+
+               // Quantifier
+               switch (*s) {
+                       case '?': {
+                               HOOK_ALL(0, whitelist, +1);
+                               EAT(1);
+                       } break;
+                       case '*': {
+                               HOOK_ALL(0, whitelist,  0);
+                               EAT(1);
+                       } break;
+                       case '+': {
+                               HOOK_ALL(0, whitelist, +1);
+                               state += 1;
+                               HOOK_ALL(0, whitelist,  0);
+                               EAT(1);
+                       } break;
+                       default: { // Literal
+                               HOOK_ALL(0, whitelist, +1);
+                               state += 1;
+                       } break;
+               }
+       }
+
+       reg.accepting_state = state;
+
+       return r;
+}
+
+inline bool catch_(const regex_t * regex,
+                        int     & state) {
+
+       const regex_t &reg = *regex;
+       for (int i = 0; i < reg.catch_table.size(); i++){
+               if (reg.catch_table[i].in == state) {
+                       state = reg.catch_table[i].to;
+                       return true;
+               }
+       }
+       return false;
+}
+
+bool regex_assert(const regex_t * const  regex,
+                  const char    * const string,
+                                       int              state) {
+
+       const regex_t &reg = *regex;
+       for (const char * s = string; *s != '\00'; s++) {
+               // delta
+               for (int i = 0; i < reg.delta_table.size(); i++) {
+                       if ((reg.delta_table[i].in == state) 
+                       &&  (reg.delta_table[i].input == *s)) {
+                               if(regex_assert(regex, s+1, reg.delta_table[i].to)){
+                                       return true;
+                               }
+                       }
+               }
+
+               if (catch_(regex, state)) {
+                       continue;
+               }
+
+               return false;
+       }
+
+       return (state == regex->accepting_state);
+}
+
+bool regex_search(      regex_t *        regex,
+                  const char    * const string) {
+
+       if (regex == NULL) {
+               return false;
+       }
+       if (string == NULL) {
+               return true;
+       }
+
+       return regex_assert(regex, string, 0);
+}
diff --git a/tests/regex_tester.cpp b/tests/regex_tester.cpp
new file mode 100644 (file)
index 0000000..b3ccf06
--- /dev/null
@@ -0,0 +1,76 @@
+// @COMPILECMD g++ $@ -o regtest -O0 -ggdb -pg -fno-inline
+#include <stdio.h>
+#include "regex.hpp"
+
+#define TEST(a, b, expected) do { \
+       r = regex_compile(a); \
+       bool result = regex_search(r, b); \
+       bool passed = (result == expected); \
+       if (passed) { printf("Success.  - "); } else { printf("Failiour. - "); } \
+       printf("%s vs %s: Result = %d, Expected = %d\n", #a, #b, result, expected); \
+       ++num_tests; \
+       if (passed) { ++passed_tests; } \
+} while(0)
+
+signed main() {
+       int num_tests = 0;
+       int passed_tests = 0;
+       regex_t * r;
+
+       TEST(R"del(abc)del","abc",true);
+       TEST(R"del(efg1)del","efg1",true);
+       TEST(R"del(nig)del","ger",false);
+
+       puts("");
+
+       TEST(R"del(ab+c)del","abc",true);
+       TEST(R"del(ef+g1)del","effffg1",true);
+       TEST(R"del(ni*g?)del","ngg",false);
+
+       puts("");
+
+       TEST(R"del(ne.)del","net",true);
+       TEST(R"del(ne.)del","ne",false);
+       TEST(R"del(ne.+)del","neoo",true);
+
+       puts("");
+
+       TEST(R"del(ne.)del","ne\t",true);
+       TEST(R"del(ne\t)del","ne",false);
+       TEST(R"del(ne\t)del","ne\t",true);
+
+       puts("");
+
+       TEST(R"del(\sa)del"," a",true);
+       TEST(R"del(\wi)del","hi",true);
+       TEST(R"del(\w+)del","asd",true);
+
+       puts("");
+
+       TEST(R"del([A-Za-z]+)del","HelloWorld",true);
+       TEST(R"del([A-Za-z]+g)del","HelloWorldg",true);
+       TEST(R"del([A-Za-z]+g)del","g",false);
+
+       puts("");
+
+       TEST(R"del(a+a)del","aaa",true);
+       TEST(R"del(a+a)del","aa",true);
+       TEST(R"del(a+a)del","a",false);
+
+       //++num_tests; TEST(R"del(\d{3})del","123",true);
+       //++num_tests; TEST(R"del(^\w+@\w+\.\w+$)del","example@email.com",true);
+
+       //++num_tests; TEST(R"del(\b\w+\b)del","This is a test",true);
+       //++num_tests; TEST(R"del(^[A-Za-z]+\s\d+)del","OpenAI 123",true);
+       //++num_tests; TEST(R"del([0-9]{4}-[0-9]{2}-[0-9]{2})del","2023-08-22",true);
+
+       //++num_tests; TEST(R"del(^[^abc]+$)del","def123",true);
+       //++num_tests; TEST(R"del(\b\d{5}\b)del","12345 67890",true);
+       //++num_tests; TEST(R"del(^[A-Z][a-z]+$)del","OpenAI",true);
+
+       //++num_tests; TEST(R"del(\d{3}-\d{2}-\d{4})del","123-45-6789",true);
+       //++num_tests; TEST(R"del(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})del","192.168.1.1",true);
+       //++num_tests; TEST(R"del(^\w{8,12})del","Password123", false);
+
+       printf("\nPassed %d out of %d tests.\n", passed_tests, num_tests);
+}