This commit is contained in:
anon
2023-11-18 01:46:10 +01:00
parent b47bd13141
commit 7a6e287566
12 changed files with 382 additions and 298 deletions

@ -4,6 +4,7 @@ ifeq ($(DEBUG), 1)
LFLAGS += --debug --trace
CXXFLAGS += -Wall -Wextra -Wpedantic
CXXFLAGS += -DDEBUG -O0 -ggdb -pg -fno-inline
.PHONY: ${OUT}
else
CXXFLAGS += -O3 -fno-stack-protector -fno-exceptions -fno-rtti
endif
@ -15,7 +16,7 @@ OBJECT.d := object/
TEST.d := test/
INSTALL.d := /bin/
SOURCE := main.cpp xml.cpp csml.cpp cli.cpp global.cpp
SOURCE := main.cpp xml.cpp csml.cpp cli.cpp global.cpp html_special.cpp
OBJECT := $(addprefix ${OBJECT.d}/,${SOURCE})
OBJECT := ${OBJECT:.cpp=.o}
OBJECT := ${OBJECT:.c=.o}

@ -20,6 +20,7 @@ const char * const help_message =
" -h : print help and quit\n"
;
extern "C"
signed parse_round1_arguments(int argc, char * * argv){
const char * const optstring = "-" "hv" "cxi:o:q:";

@ -3,6 +3,7 @@
/* Parse arguments with permanent effects (-h)
* Perform validation.
*/
extern "C"
signed parse_round1_arguments(int argc, char * * argv);
#define CLI_H

@ -5,10 +5,11 @@
#include <stack>
#include <string>
#include "html_special.hpp"
#include "global.hpp"
std::stack<std::string> tag_stack;
std::string tag_candidate = "";
static std::stack<std::string> tag_stack;
static std::string tag_candidate = "";
static
void _ECHO_CANDIDATE(){
@ -25,6 +26,9 @@ static const char COMMENT_END[] = "-->";
static const char ATTRIBUTE_VALUE_START[] = "'";
static const char ATTRIBUTE_VALUE_END[] = "'";
static unsigned short current_unicode_size;
static
bool push_tag() {
if (tag_candidate == "") {
exit(3);
@ -38,6 +42,7 @@ bool push_tag() {
return true;
}
static
bool pop_tag() {
if (tag_stack.empty()) {
exit(3);
@ -50,6 +55,7 @@ bool pop_tag() {
return true;
}
%}
%option noyywrap
@ -60,10 +66,12 @@ bool pop_tag() {
%x COMMENT COMMENT_MULTILINE
%x IGNORE IGNORE_COUNT_START IGNORE_COUNT_END
%x STRING
%s UNICODE
ws [ \t\r\v\f]
wsnl [ \t\r\v\f\n]
identifier [A-z][A-z0-9]*
unicode [\300-\364]
%%
@ -87,6 +95,11 @@ identifier [A-z][A-z0-9]*
ECHOS(("<" + tag_stack.top() + " ").c_str());
BEGIN HEAD;
}
&#?{identifier}; {
/* Entity-like token: '&', an optional '#', an {identifier}, then ';'.
 * The matched text is copied to the output as-is by ECHO instead of being
 * treated as tag text.
 * NOTE(review): {identifier} must start with a character from [A-z], so a
 * decimal reference such as "&#193;" does not match this rule - confirm
 * whether that is intended.
 */
/* ECHO_CANDIDATE is defined outside this view; presumably it emits the
 * pending tag_candidate text before it is reset - verify. */
ECHO_CANDIDATE;
tag_candidate = "";
ECHO;
}
; {
ECHOS(("<" + tag_candidate + "/>").c_str());
tag_candidate = "";
@ -95,6 +108,7 @@ identifier [A-z][A-z0-9]*
push_tag();
ECHOS(("<" + tag_stack.top() + ">").c_str());
if (do_ignore(tag_stack.top())) {
buffer = std::string("");
BEGIN IGNORE_COUNT_START;
}
}
@ -113,6 +127,19 @@ identifier [A-z][A-z0-9]*
tag_candidate = "";
ECHOS("&gt;");
}
{unicode} {
    /* Lead byte of a multi-byte UTF-8 sequence (0xC0-0xF4).
     * Work out how many bytes the sequence occupies, push the byte back and
     * let the UNICODE state consume the whole sequence.
     */
    ECHO_CANDIDATE;
    tag_candidate = "";
    /* Both values are unsigned on purpose: plain char may be signed, and the
     * previous 9-bit literal did not fit into a char at all, which left the
     * mask at 0 and the size stuck at 2.
     */
    const unsigned char mask = 0x80;
    const unsigned char header = static_cast<unsigned char>(yytext[0]);
    /* 110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes, 11110xxx -> 4 bytes:
     * every additional leading 1 bit after the first two adds one byte.
     */
    current_unicode_size = 2;
    for (int i = 2; (mask >> i) & header; i++) {
        ++current_unicode_size;
    }
    /* Return the lead byte to the input so UNICODE sees the full sequence. */
    yyless(0);
    BEGIN UNICODE;
}
.|{wsnl} {
ECHO;
}
@ -177,7 +204,7 @@ identifier [A-z][A-z0-9]*
++ignore_count;
}
.|\n {
ECHO;
BUFFER(yytext);
BEGIN IGNORE;
}
}
@ -188,6 +215,7 @@ identifier [A-z][A-z0-9]*
ignore_i = 0;
ignore_count = 1;
ECHOS(buffer.c_str());
ECHOS(("</" + tag_stack.top() + ">").c_str());
pop_tag();
BEGIN BODY;
@ -195,10 +223,10 @@ identifier [A-z][A-z0-9]*
}
.|\n {
while (ignore_i--) {
ECHOC('}');
BUFFER('}');
}
ignore_i = 1;
ECHO;
BUFFER(yytext);
BEGIN IGNORE;
}
}
@ -211,13 +239,26 @@ identifier [A-z][A-z0-9]*
if (ignore_count != 1) {
BEGIN IGNORE_COUNT_END;
} else {
ECHOS(buffer.c_str());
ECHOS(("</" + tag_stack.top() + ">").c_str());
pop_tag();
BEGIN BODY;
}
}
.|\n {
ECHO;
BUFFER(yytext);
}
}
<UNICODE>{
(.|\n){4} {
    /* Four bytes are always grabbed because that is the longest UTF-8
     * sequence; only the first current_unicode_size of them belong to the
     * character that put the scanner into this state.
     */
    static char current_unicode[5];
    /* yytext is NUL-terminated, so copying 5 bytes includes the terminator. */
    memcpy(current_unicode, yytext, 5);
    current_unicode[current_unicode_size] = '\0';
    /* yyless(n) KEEPS the first n bytes and returns the rest to the input,
     * so the argument is the sequence length itself, not the surplus.
     */
    yyless(current_unicode_size);
    ECHOS(utf8_to_html_special(current_unicode));
    BEGIN BODY;
}
}

@ -2,23 +2,29 @@
#include <string.h>
#include <string>
#include <vector>
void trim(char * s) {
std::vector<std::string> ignore_list;
int ignore_count = 1;
int ignore_i = 1;
std::string buffer;
void trim(char * const s) {
int bp = 0;
int len = strlen(s);
bool do_break = false;
int i = 0;
for (;i < len; i++) {
if ((s[i] >= 'A' && s[i] <= 'Z')
|| (s[i] >= 'a' && s[i] <= 'z')
|| (s[i] >= '0' && s[i] <= '9')
|| (s[i] == '_')) {
s[bp++] = s[i];
for (;i < len; i++) {
if ((s[i] >= 'A' && s[i] <= 'Z')
|| (s[i] >= 'a' && s[i] <= 'z')
|| (s[i] >= '0' && s[i] <= '9')
|| (s[i] == '_')) {
s[bp++] = s[i];
do_break = true;
} else if (do_break) {
} else if (do_break) {
break;
}
}
}
}
s[bp] = '\0';
}

40
source/html_special.cpp Normal file

@ -0,0 +1,40 @@
#include "html_special.hpp"
// Number of rows in html_special_table: the byte size of the whole array
// divided by the byte size of its first row.
const size_t html_special_table_size =
sizeof(html_special_table)
/
sizeof(html_special_table[0]);
extern "C"
const char * html_special_table_lookup(const char * const name) {
    // Linear scan over every row; column 0 is compared against `name`
    // and column 1 of the first matching row is returned.
    // XXX: this should be a iterating-decreasing jump search
    for (const auto &entry : html_special_table) {
        if (strcmp(name, entry[0]) == 0) {
            return entry[1];
        }
    }
    // No row matched.
    return NULL;
}
/* Convert the decimal code point contained in `special` to UTF-8.
 *
 * `special` is copied and passed through trim() before the leading run of
 * decimal digits is parsed.
 *
 * Returns a pointer to a function-local static buffer that is overwritten by
 * the next call, or NULL when no digits are found or the value is not a
 * valid Unicode scalar value (above U+10FFFF or a surrogate).
 *
 * Parsing and encoding are done by hand: std::stoi and std::wstring_convert
 * report errors by throwing, which cannot be handled in builds compiled with
 * -fno-exceptions.
 */
extern "C"
const char * html_special_to_utf8(const char * const special) {
    static std::string r;
    r = std::string(special);
    trim(r);

    // Parse the leading decimal digits; the range check on every step also
    // keeps the accumulator from overflowing.
    uint32_t code_point = 0;
    size_t digits = 0;
    while (digits < r.size() && r[digits] >= '0' && r[digits] <= '9') {
        code_point = code_point * 10 + static_cast<uint32_t>(r[digits] - '0');
        if (code_point > 0x10FFFF) {
            return NULL;
        }
        ++digits;
    }
    if (digits == 0 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
        return NULL;
    }

    // Encode as 1 to 4 UTF-8 bytes.
    r.clear();
    if (code_point < 0x80) {
        r += static_cast<char>(code_point);
    } else if (code_point < 0x800) {
        r += static_cast<char>(0xC0 | (code_point >> 6));
        r += static_cast<char>(0x80 | (code_point & 0x3F));
    } else if (code_point < 0x10000) {
        r += static_cast<char>(0xE0 | (code_point >> 12));
        r += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        r += static_cast<char>(0x80 | (code_point & 0x3F));
    } else {
        r += static_cast<char>(0xF0 | (code_point >> 18));
        r += static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        r += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        r += static_cast<char>(0x80 | (code_point & 0x3F));
    }
    return r.c_str();
}
/* Map a UTF-8 encoded character to its named HTML entity.
 *
 * Column 2 of html_special_table is compared against `utf`; column 0 of the
 * first matching row is returned. When nothing matches, `utf` itself is
 * returned so the caller can emit the text unchanged.
 */
extern "C"
const char * utf8_to_html_special(const char * const utf) {
    // Rows without a UTF-8 form carry "" in column 2; an empty input must not
    // be reported as the first of those entities.
    if (utf[0] == '\0') {
        return utf;
    }
    for (size_t i = 0; i < html_special_table_size; i++) {
        if (!strcmp(utf, html_special_table[i][2])) {
            return html_special_table[i][0];
        }
    }
    return utf;
}

@ -1,273 +1,265 @@
#ifndef HTML_SPECIAL_HPP
#include <string.h>
#include <string>
#include <codecvt>
#include <locale>
#include "global.hpp"
const char * html_special_table[][2] =
/* Look up `name` in column 0 of html_special_table and return column 1 of
 * the matching row, or NULL when no row matches. */
extern "C"
const char * html_special_table_lookup(const char * const name);
/* Convert the code point number contained in `special` to its UTF-8 byte
 * sequence. The result lives in a function-local static buffer that is
 * reused by every call. */
extern "C"
const char * html_special_to_utf8(const char * const special);
/* Return column 0 of the html_special_table row whose column 2 equals
 * `utf`, or `utf` itself when no row matches. */
extern "C"
const char * utf8_to_html_special(const char * const utf);
inline
const char * const html_special_table[][3] =
{
{"&Aacute;", "&#193;"},
{"&aacute;", "&#225;"},
{"&Acirc;", "&#194;"},
{"&acirc;", "&#226;"},
{"&acute;", "&#180;"},
{"&AElig;", "&#198;"},
{"&aelig;", "&#230;"},
{"&Agrave;", "&#192;"},
{"&agrave;", "&#224;"},
{"&Alpha;", "&#913;"},
{"&alpha;", "&#945;"},
{"&amp;", "&#38;"},
{"&and;", "&#8743;"},
{"&ang;", "&#8736;"},
{"&apos;", "&#39;"},
{"&Aring;", "&#197;"},
{"&aring;", "&#229;"},
{"&asymp;", "&#8776;"},
{"&Atilde;", "&#195;"},
{"&atilde;", "&#227;"},
{"&Auml;", "&#196;"},
{"&auml;", "&#228;"},
{"&bdquo;", "&#8222;"},
{"&Beta;", "&#914;"},
{"&beta;", "&#946;"},
{"&brvbar;", "&#166;"},
{"&bull;", "&#8226;"},
{"&cap;", "&#8745;"},
{"&Ccedil;", "&#199;"},
{"&ccedil;", "&#231;"},
{"&cedil;", "&#184;"},
{"&cent;", "&#162;"},
{"&Chi;", "&#935;"},
{"&chi;", "&#967;"},
{"&circ;", "&#710;"},
{"&clubs;", "&#9827;"},
{"&cong;", "&#8773;"},
{"&copy;", "&#169;"},
{"&crarr;", "&#8629;"},
{"&cup;", "&#8746;"},
{"&curren;", "&#164;"},
{"&dagger;", "&#8224;"},
{"&Dagger;", "&#8225;"},
{"&darr;", "&#8595;"},
{"&deg;", "&#176;"},
{"&Delta;", "&#916;"},
{"&delta;", "&#948;"},
{"&diams;", "&#9830;"},
{"&divide;", "&#247;"},
{"&Eacute;", "&#201;"},
{"&eacute;", "&#233;"},
{"&Ecirc;", "&#202;"},
{"&ecirc;", "&#234;"},
{"&Egrave;", "&#200;"},
{"&egrave;", "&#232;"},
{"&empty;", "&#8709;"},
{"&emsp;", "&#8195;"},
{"&ensp;", "&#8194;"},
{"&Epsilon;", "&#917;"},
{"&epsilon;", "&#949;"},
{"&equiv;", "&#8801;"},
{"&Eta;", "&#919;"},
{"&eta;", "&#951;"},
{"&ETH;", "&#208;"},
{"&eth;", "&#240;"},
{"&Euml;", "&#203;"},
{"&euml;", "&#235;"},
{"&euro;", "&#8364;"},
{"&exist;", "&#8707;"},
{"&fnof;", "&#402;"},
{"&forall;", "&#8704;"},
{"&frac12;", "&#189;"},
{"&frac14;", "&#188;"},
{"&frac34;", "&#190;"},
{"&Gamma;", "&#915;"},
{"&gamma;", "&#947;"},
{"&ge;", "&#8805;"},
{"&gt;", "&#62;"},
{"&harr;", "&#8596;"},
{"&hearts;", "&#9829;"},
{"&hellip;", "&#8230;"},
{"&Iacute;", "&#205;"},
{"&iacute;", "&#237;"},
{"&Icirc;", "&#206;"},
{"&icirc;", "&#238;"},
{"&iexcl;", "&#161;"},
{"&Igrave;", "&#204;"},
{"&igrave;", "&#236;"},
{"&infin;", "&#8734;"},
{"&int;", "&#8747;"},
{"&Iota;", "&#921;"},
{"&iota;", "&#953;"},
{"&iquest;", "&#191;"},
{"&isin;", "&#8712;"},
{"&Iuml;", "&#207;"},
{"&iuml;", "&#239;"},
{"&Kappa;", "&#922;"},
{"&kappa;", "&#954;"},
{"&Lambda;", "&#923;"},
{"&lambda;", "&#955;"},
{"&laquo;", "&#171;"},
{"&larr;", "&#8592;"},
{"&lceil;", "&#8968;"},
{"&ldquo;", "&#8220;"},
{"&le;", "&#8804;"},
{"&lfloor;", "&#8970;"},
{"&lowast;", "&#8727;"},
{"&loz;", "&#9674;"},
{"&lrm;", "&#8206;"},
{"&lsaquo;", "&#8249;"},
{"&lsquo;", "&#8216;"},
{"&lt;", "&#60;"},
{"&macr;", "&#175;"},
{"&mdash;", "&#8212;"},
{"&micro;", "&#181;"},
{"&middot;", "&#183;"},
{"&minus;", "&#8722;"},
{"&Mu;", "&#924;"},
{"&mu;", "&#956;"},
{"&nabla;", "&#8711;"},
{"&nbsp;", "&#160;"},
{"&ndash;", "&#8211;"},
{"&ne;", "&#8800;"},
{"&ni;", "&#8715;"},
{"&not;", "&#172;"},
{"&notin;", "&#8713;"},
{"&nsub;", "&#8836;"},
{"&Ntilde;", "&#209;"},
{"&ntilde;", "&#241;"},
{"&Nu;", "&#925;"},
{"&nu;", "&#957;"},
{"&Oacute;", "&#211;"},
{"&oacute;", "&#243;"},
{"&Ocirc;", "&#212;"},
{"&ocirc;", "&#244;"},
{"&OElig;", "&#338;"},
{"&oelig;", "&#339;"},
{"&Ograve;", "&#210;"},
{"&ograve;", "&#242;"},
{"&oline;", "&#8254;"},
{"&Omega;", "&#937;"},
{"&omega;", "&#969;"},
{"&Omicron;", "&#927;"},
{"&omicron;", "&#959;"},
{"&oplus;", "&#8853;"},
{"&or;", "&#8744;"},
{"&ordf;", "&#170;"},
{"&ordm;", "&#186;"},
{"&Oslash;", "&#216;"},
{"&oslash;", "&#248;"},
{"&Otilde;", "&#213;"},
{"&otilde;", "&#245;"},
{"&otimes;", "&#8855;"},
{"&Ouml;", "&#214;"},
{"&ouml;", "&#246;"},
{"&para;", "&#182;"},
{"&part;", "&#8706;"},
{"&permil;", "&#8240;"},
{"&perp;", "&#8869;"},
{"&Phi;", "&#934;"},
{"&phi;", "&#966;"},
{"&Pi;", "&#928;"},
{"&pi;", "&#960;"},
{"&piv;", "&#982;"},
{"&plusmn;", "&#177;"},
{"&pound;", "&#163;"},
{"&prime;", "&#8242;"},
{"&Prime;", "&#8243;"},
{"&prod;", "&#8719;"},
{"&prop;", "&#8733;"},
{"&Psi;", "&#936;"},
{"&psi;", "&#968;"},
{"&quot;", "&#34;"},
{"&radic;", "&#8730;"},
{"&raquo;", "&#187;"},
{"&rarr;", "&#8594;"},
{"&rceil;", "&#8969;"},
{"&rdquo;", "&#8221;"},
{"&reg;", "&#174;"},
{"&rfloor;", "&#8971;"},
{"&Rho;", "&#929;"},
{"&rho;", "&#961;"},
{"&rlm;", "&#8207;"},
{"&rsaquo;", "&#8250;"},
{"&rsquo;", "&#8217;"},
{"&sbquo;", "&#8218;"},
{"&Scaron;", "&#352;"},
{"&scaron;", "&#353;"},
{"&sdot;", "&#8901;"},
{"&sect;", "&#167;"},
{"&shy;", "&#173;"},
{"&Sigma;", "&#931;"},
{"&sigma;", "&#963;"},
{"&sigmaf;", "&#962;"},
{"&sim;", "&#8764;"},
{"&spades;", "&#9824;"},
{"&sub;", "&#8834;"},
{"&sube;", "&#8838;"},
{"&sum;", "&#8721;"},
{"&sup1;", "&#185;"},
{"&sup2;", "&#178;"},
{"&sup3;", "&#179;"},
{"&sup;", "&#8835;"},
{"&supe;", "&#8839;"},
{"&szlig;", "&#223;"},
{"&Tau;", "&#932;"},
{"&tau;", "&#964;"},
{"&there4;", "&#8756;"},
{"&Theta;", "&#920;"},
{"&theta;", "&#952;"},
{"&thetasym;", "&#977;"},
{"&thinsp;", "&#8201;"},
{"&THORN;", "&#222;"},
{"&thorn;", "&#254;"},
{"&tilde;", "&#732;"},
{"&times;", "&#215;"},
{"&trade;", "&#8482;"},
{"&Uacute;", "&#218;"},
{"&uacute;", "&#250;"},
{"&uarr;", "&#8593;"},
{"&Ucirc;", "&#219;"},
{"&ucirc;", "&#251;"},
{"&Ugrave;", "&#217;"},
{"&ugrave;", "&#249;"},
{"&uml;", "&#168;"},
{"&upsih;", "&#978;"},
{"&Upsilon;", "&#933;"},
{"&upsilon;", "&#965;"},
{"&Uuml;", "&#220;"},
{"&uuml;", "&#252;"},
{"&Xi;", "&#926;"},
{"&xi;", "&#958;"},
{"&Yacute;", "&#221;"},
{"&yacute;", "&#253;"},
{"&yen;", "&#165;"},
{"&yuml;", "&#255;"},
{"&Yuml;", "&#376;"},
{"&Zeta;", "&#918;"},
{"&zeta;", "&#950;"},
{"&zwj;", "&#8205;"},
{"&zwnj;", "&#8204;"},
{"&Aacute;", "&#193;", "Á"},
{"&aacute;", "&#225;", "á"},
{"&Acirc;", "&#194;", "Â"},
{"&acirc;", "&#226;", "â"},
{"&acute;", "&#180;", "´"},
{"&AElig;", "&#198;", "Æ"},
{"&aelig;", "&#230;", "æ"},
{"&Agrave;", "&#192;", "À"},
{"&agrave;", "&#224;", "à"},
{"&Alpha;", "&#913;", "Α"},
{"&alpha;", "&#945;", "α"},
{"&amp;", "&#38;", "&"},
{"&and;", "&#8743;", ""},
{"&ang;", "&#8736;", ""},
{"&apos;", "&#39;", "'"},
{"&Aring;", "&#197;", "Å"},
{"&aring;", "&#229;", "å"},
{"&asymp;", "&#8776;", ""},
{"&Atilde;", "&#195;", "Ã"},
{"&atilde;", "&#227;", "ã"},
{"&Auml;", "&#196;", "Ä"},
{"&auml;", "&#228;", "ä"},
{"&bdquo;", "&#8222;", ""},
{"&Beta;", "&#914;", "Β"},
{"&beta;", "&#946;", "β"},
{"&brvbar;", "&#166;", "¦"},
{"&bull;", "&#8226;", ""},
{"&cap;", "&#8745;", ""},
{"&Ccedil;", "&#199;", "Ç"},
{"&ccedil;", "&#231;", "ç"},
{"&cedil;", "&#184;", "¸"},
{"&cent;", "&#162;", "¢"},
{"&Chi;", "&#935;", "Χ"},
{"&chi;", "&#967;", "χ"},
{"&circ;", "&#710;", "ˆ"},
{"&clubs;", "&#9827;", ""},
{"&cong;", "&#8773;", ""},
{"&copy;", "&#169;", "©"},
{"&crarr;", "&#8629;", ""},
{"&cup;", "&#8746;", ""},
{"&curren;", "&#164;", "¤"},
{"&dagger;", "&#8224;", ""},
{"&Dagger;", "&#8225;", ""},
{"&darr;", "&#8595;", ""},
{"&deg;", "&#176;", "°"},
{"&Delta;", "&#916;", "Δ"},
{"&delta;", "&#948;", "δ"},
{"&diams;", "&#9830;", ""},
{"&divide;", "&#247;", "÷"},
{"&Eacute;", "&#201;", "É"},
{"&eacute;", "&#233;", "é"},
{"&Ecirc;", "&#202;", "Ê"},
{"&ecirc;", "&#234;", "ê"},
{"&Egrave;", "&#200;", "È"},
{"&egrave;", "&#232;", "è"},
{"&empty;", "&#8709;", ""},
{"&emsp;", "&#8195;", ""},
{"&ensp;", "&#8194;", ""},
{"&Epsilon;", "&#917;", "Ε"},
{"&epsilon;", "&#949;", "ε"},
{"&equiv;", "&#8801;", ""},
{"&Eta;", "&#919;", "Η"},
{"&eta;", "&#951;", "η"},
{"&ETH;", "&#208;", "Ð"},
{"&eth;", "&#240;", "ð"},
{"&Euml;", "&#203;", "Ë"},
{"&euml;", "&#235;", "ë"},
{"&euro;", "&#8364;", ""},
{"&exist;", "&#8707;", ""},
{"&fnof;", "&#402;", "ƒ"},
{"&forall;", "&#8704;", ""},
{"&frac12;", "&#189;", "½"},
{"&frac14;", "&#188;", "¼"},
{"&frac34;", "&#190;", "¾"},
{"&Gamma;", "&#915;", "Γ"},
{"&gamma;", "&#947;", "γ"},
{"&ge;", "&#8805;", ""},
{"&gt;", "&#62;", ">"},
{"&harr;", "&#8596;", ""},
{"&hearts;", "&#9829;", ""},
{"&hellip;", "&#8230;", ""},
{"&Iacute;", "&#205;", "Í"},
{"&iacute;", "&#237;", "í"},
{"&Icirc;", "&#206;", "Î"},
{"&icirc;", "&#238;", "î"},
{"&iexcl;", "&#161;", "¡"},
{"&Igrave;", "&#204;", "Ì"},
{"&igrave;", "&#236;", "ì"},
{"&infin;", "&#8734;", ""},
{"&int;", "&#8747;", ""},
{"&Iota;", "&#921;", "Ι"},
{"&iota;", "&#953;", "ι"},
{"&iquest;", "&#191;", "¿"},
{"&isin;", "&#8712;", ""},
{"&Iuml;", "&#207;", "Ï"},
{"&iuml;", "&#239;", "ï"},
{"&Kappa;", "&#922;", "Κ"},
{"&kappa;", "&#954;", "κ"},
{"&Lambda;", "&#923;", "Λ"},
{"&lambda;", "&#955;", "λ"},
{"&laquo;", "&#171;", "«"},
{"&larr;", "&#8592;", ""},
{"&lceil;", "&#8968;", ""},
{"&ldquo;", "&#8220;", ""},
{"&le;", "&#8804;", ""},
{"&lfloor;", "&#8970;", ""},
{"&lowast;", "&#8727;", ""},
{"&loz;", "&#9674;", ""},
{"&lrm;", "&#8206;", ""},
{"&lsaquo;", "&#8249;", ""},
{"&lsquo;", "&#8216;", ""},
{"&lt;", "&#60;", "<"},
{"&macr;", "&#175;", "¯"},
{"&mdash;", "&#8212;", ""},
{"&micro;", "&#181;", "µ"},
{"&middot;", "&#183;", "·"},
{"&minus;", "&#8722;", ""},
{"&Mu;", "&#924;", "Μ"},
{"&mu;", "&#956;", "μ"},
{"&nabla;", "&#8711;", ""},
{"&nbsp;", "&#160;", " "},
{"&ndash;", "&#8211;", ""},
{"&ne;", "&#8800;", ""},
{"&ni;", "&#8715;", ""},
{"&not;", "&#172;", "¬"},
{"&notin;", "&#8713;", ""},
{"&nsub;", "&#8836;", ""},
{"&Ntilde;", "&#209;", "Ñ"},
{"&ntilde;", "&#241;", "ñ"},
{"&Nu;", "&#925;", "Ν"},
{"&nu;", "&#957;", "ν"},
{"&Oacute;", "&#211;", "Ó"},
{"&oacute;", "&#243;", "ó"},
{"&Ocirc;", "&#212;", "Ô"},
{"&ocirc;", "&#244;", "ô"},
{"&OElig;", "&#338;", "Œ"},
{"&oelig;", "&#339;", "œ"},
{"&Ograve;", "&#210;", "Ò"},
{"&ograve;", "&#242;", "ò"},
{"&oline;", "&#8254;", ""},
{"&Omega;", "&#937;", "Ω"},
{"&omega;", "&#969;", "ω"},
{"&Omicron;", "&#927;", "Ο"},
{"&omicron;", "&#959;", "ο"},
{"&oplus;", "&#8853;", ""},
{"&or;", "&#8744;", ""},
{"&ordf;", "&#170;", "ª"},
{"&ordm;", "&#186;", "º"},
{"&Oslash;", "&#216;", "Ø"},
{"&oslash;", "&#248;", "ø"},
{"&Otilde;", "&#213;", "Õ"},
{"&otilde;", "&#245;", "õ"},
{"&otimes;", "&#8855;", ""},
{"&Ouml;", "&#214;", "Ö"},
{"&ouml;", "&#246;", "ö"},
{"&para;", "&#182;", ""},
{"&part;", "&#8706;", ""},
{"&permil;", "&#8240;", ""},
{"&perp;", "&#8869;", ""},
{"&Phi;", "&#934;", "Φ"},
{"&phi;", "&#966;", "φ"},
{"&Pi;", "&#928;", "Π"},
{"&pi;", "&#960;", "π"},
{"&piv;", "&#982;", "ϖ"},
{"&plusmn;", "&#177;", "±"},
{"&pound;", "&#163;", "£"},
{"&prime;", "&#8242;", ""},
{"&Prime;", "&#8243;", ""},
{"&prod;", "&#8719;", ""},
{"&prop;", "&#8733;", ""},
{"&Psi;", "&#936;", "Ψ"},
{"&psi;", "&#968;", "ψ"},
{"&quot;", "&#34;", "\""},
{"&radic;", "&#8730;", ""},
{"&raquo;", "&#187;", "»"},
{"&rarr;", "&#8594;", ""},
{"&rceil;", "&#8969;", ""},
{"&rdquo;", "&#8221;", ""},
{"&reg;", "&#174;", "®"},
{"&rfloor;", "&#8971;", ""},
{"&Rho;", "&#929;", "Ρ"},
{"&rho;", "&#961;", "ρ"},
{"&rlm;", "&#8207;", ""},
{"&rsaquo;", "&#8250;", ""},
{"&rsquo;", "&#8217;", ""},
{"&sbquo;", "&#8218;", ""},
{"&Scaron;", "&#352;", "Š"},
{"&scaron;", "&#353;", "š"},
{"&sdot;", "&#8901;", ""},
{"&sect;", "&#167;", "§"},
{"&shy;", "&#173;", "­"},
{"&Sigma;", "&#931;", "Σ"},
{"&sigma;", "&#963;", "σ"},
{"&sigmaf;", "&#962;", "ς"},
{"&sim;", "&#8764;", ""},
{"&spades;", "&#9824;", ""},
{"&sub;", "&#8834;", ""},
{"&sube;", "&#8838;", ""},
{"&sum;", "&#8721;", ""},
{"&sup1;", "&#185;", "¹"},
{"&sup2;", "&#178;", "²"},
{"&sup3;", "&#179;", "³"},
{"&sup;", "&#8835;", ""},
{"&supe;", "&#8839;", ""},
{"&szlig;", "&#223;", "ß"},
{"&Tau;", "&#932;", "Τ"},
{"&tau;", "&#964;", "τ"},
{"&there4;", "&#8756;", ""},
{"&Theta;", "&#920;", "Θ"},
{"&theta;", "&#952;", "θ"},
{"&thetasym;", "&#977;", "ϑ"},
{"&thinsp;", "&#8201;", ""},
{"&THORN;", "&#222;", "Þ"},
{"&thorn;", "&#254;", "þ"},
{"&tilde;", "&#732;", "˜"},
{"&times;", "&#215;", "×"},
{"&trade;", "&#8482;", ""},
{"&Uacute;", "&#218;", "Ú"},
{"&uacute;", "&#250;", "ú"},
{"&uarr;", "&#8593;", ""},
{"&Ucirc;", "&#219;", "Û"},
{"&ucirc;", "&#251;", "û"},
{"&Ugrave;", "&#217;", "Ù"},
{"&ugrave;", "&#249;", "ù"},
{"&uml;", "&#168;", "¨"},
{"&upsih;", "&#978;", "ϒ"},
{"&Upsilon;", "&#933;", "Υ"},
{"&upsilon;", "&#965;", "υ"},
{"&Uuml;", "&#220;", "Ü"},
{"&uuml;", "&#252;", "ü"},
{"&Xi;", "&#926;", "Ξ"},
{"&xi;", "&#958;", "ξ"},
{"&Yacute;", "&#221;", "Ý"},
{"&yacute;", "&#253;", "ý"},
{"&yen;", "&#165;", "¥"},
{"&yuml;", "&#255;", "ÿ"},
{"&Yuml;", "&#376;", "Ÿ"},
{"&Zeta;", "&#918;", "Ζ"},
{"&zeta;", "&#950;", "ζ"},
{"&zwj;", "&#8205;", ""},
{"&zwnj;", "&#8204;", ""},
};
const size_t html_special_table_size = sizeof(html_special_table) / sizeof(html_special_table[0]);
const char * html_special_table_lookup(const char * const name) {
// this should be a iterating-decreasing jump search
for (int i = 0; i < html_special_table_size; i++) {
if (!strcmp(name, html_special_table[i][0])) {
return html_special_table[i][1];
}
}
return NULL;
}
const char * html_special_to_utf8(const char * const special) {
static std::string r(special);
trim(r);
uint32_t i(std::stoi(r));
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv1;
r = conv1.to_bytes(i);
return r.c_str();
}
#define HTML_SPECIAL_HPP
#endif

@ -8,6 +8,7 @@
#include <map>
#include "cli.hpp"
#include "scanner.hpp"
#include "exit_values.hpp"
#define DECLARE_LEXER(x) \
@ -18,10 +19,6 @@
DECLARE_LEXER(csml);
DECLARE_LEXER(xml);
std::vector<std::string> ignore_list;
int ignore_count = 1;
int ignore_i = 1;
const char DEFAULT_QUOTE = '\'';
char quote = DEFAULT_QUOTE;
char * output = NULL;

@ -4,11 +4,12 @@
#include <string>
#include <algorithm>
#define ECHOS(s) { \
#define ECHOS(s) do { \
const char * const ss = s; \
fwrite(ss, strlen(ss), sizeof(char), yyout); \
} while (0)
#define ECHOC(c) fputc(c, yyout)
#define BUFFER(s) buffer += s
extern std::vector<std::string> ignore_list;
@ -22,6 +23,7 @@ bool do_ignore(const std::string &current_tag) {
extern int ignore_count; // number of '{' / '}'s to be placed around the current ignored block
extern int ignore_i; // number of '}'s so far
extern std::string buffer;
#define SCANNER_H
#endif

@ -6,8 +6,7 @@
#include "global.hpp"
#include "html_special.hpp"
bool is_comment_multiline = false;
unsigned long long comment_begining;
bool is_comment_multiline;
std::string current_tag;
long ignore_start;
@ -34,8 +33,9 @@ identifier [A-z][A-z0-9]*
BEGIN TAG_START;
}
\<\!-- {
comment_begining = cursor_position;
ECHOS("//");
is_comment_multiline = false;
buffer = std::string("");
BUFFER("//");
BEGIN COMMENT;
}
&[A-z]+; {
@ -51,18 +51,18 @@ identifier [A-z][A-z0-9]*
<COMMENT>{
. {
ECHO;
BUFFER(yytext);
}
\n {
BUFFER(yytext);
is_comment_multiline = true;
}
--\> {
if (is_comment_multiline) {
auto buffer = ftell(yyin);
fseek(yyin, comment_begining+1, SEEK_SET);
fputc('*', yyin);
fseek(yyin, buffer, SEEK_SET);
buffer[1] = '*';
buffer += "*/";
}
ECHOS(buffer.c_str());
BEGIN INITIAL;
}
}
@ -146,7 +146,7 @@ identifier [A-z][A-z0-9]*
dup = strdup(yytext);
trim(dup);
if (!strcmp(dup, current_tag.c_str())) {
for (int i = -1; i < ignore_count; i++) {
for (int i = 0; i < ignore_count; i++) {
ECHOC('{');
}
fseek(yyin, ignore_start, SEEK_SET);
@ -160,7 +160,7 @@ identifier [A-z][A-z0-9]*
\{ {
BEGIN IGNORE_COUNT_START;
}
\{ {
\} {
BEGIN IGNORE_COUNT_END;
}
.|\n {

@ -13,7 +13,7 @@ html {
body {
hr;
div (class: myclass) {
lorem > ipsum
lorem > ipsum ­
}
}
}

@ -1,6 +1,9 @@
<!-- DOCTYPE HTML -->
<html>
<head>
<!-- for some reason,
i feel like commenting styles
-->
<style>
div {
color: red;
@ -10,7 +13,7 @@
<body>
<hr/>
<div class='myclass'>
lorem &gt; ipsum
lorem &gt; ipsum &shy;
</div>
</body>
</html>