recovered html -> csml; made it more strict with success

This commit is contained in:
anon
2024-02-21 17:23:33 +01:00
parent 9d269d3c59
commit 4a1385d9ce
7 changed files with 113 additions and 121 deletions

View File

@ -8,8 +8,7 @@
#include "exit_values.hpp"
#include "scanner.hpp"
extern std::stack<std::string> csml_tag_stack;
extern unsigned xml_tag_stack;
extern std::stack<std::string> tag_stack;
const char * const csml_extension = ".csml";
@ -138,13 +137,13 @@ signed parse_round2_arguments(int argc, char * * argv) {
switch (input_type) {
case input_type_t::CSML: {
yylex(csml_in, csml_out, csml_lex);
if (not csml_tag_stack.empty()) {
if (not tag_stack.empty()) {
exit(POLUTED_STACK);
}
} break;
case input_type_t::XML: {
yylex(xml_in, xml_out, xml_lex);
if(xml_tag_stack) {
if (not tag_stack.empty()) {
exit(POLUTED_STACK);
}
} break;

View File

@ -2,16 +2,20 @@
#include "scanner.hpp"
#include <stdio.h>
#include <stack>
#include <string>
#include "exit_values.hpp"
#include "html_special.hpp"
#include "global.hpp"
std::stack<std::string> csml_tag_stack;
/* number of '}'s to recognize as the end
* of the current ignored block
*/
static int ignore_count = 1;
static std::string tag_candidate = "";
/* number of '}' so far; used for maximum search, updating ignore_count
*/
static int ignore_i = 1;
static
void _ECHO_CANDIDATE(){
@ -38,17 +42,17 @@ void push_tag() {
}
trim(tag_candidate);
csml_tag_stack.push(tag_candidate);
tag_stack.push(tag_candidate);
tag_candidate = "";
}
static
void pop_tag() {
if (csml_tag_stack.empty()) {
if (tag_stack.empty()) {
exit(TAG_NOT_FOUND);
}
csml_tag_stack.pop();
tag_stack.pop();
tag_candidate = "";
}
@ -90,7 +94,7 @@ unicode [\300-\364]
}
\( {
push_tag();
ECHOS(("<" + csml_tag_stack.top() + " ").c_str());
ECHOS(("<" + tag_stack.top() + " ").c_str());
BEGIN HEAD;
}
&#?{identifier}; {
@ -103,12 +107,12 @@ unicode [\300-\364]
}
\{ {
push_tag();
ECHOS(("<" + csml_tag_stack.top() + ">").c_str());
ECHOS(("<" + tag_stack.top() + ">").c_str());
BEGIN IGNORE_COUNT_START;
}
\} {
ECHO_CANDIDATE;
ECHOS(("</" + csml_tag_stack.top() + ">").c_str());
ECHOS(("</" + tag_stack.top() + ">").c_str());
pop_tag();
}
\#\! {
@ -238,7 +242,7 @@ unicode [\300-\364]
ignore_count = 1;
ECHOS(buffer.c_str());
ECHOS(("</" + csml_tag_stack.top() + ">").c_str());
ECHOS(("</" + tag_stack.top() + ">").c_str());
pop_tag();
BEGIN BODY;
}
@ -262,7 +266,7 @@ unicode [\300-\364]
BEGIN IGNORE_COUNT_END;
} else {
ECHOS(buffer.c_str());
ECHOS(("</" + csml_tag_stack.top() + ">").c_str());
ECHOS(("</" + tag_stack.top() + ">").c_str());
pop_tag();
BEGIN BODY;
}

View File

@ -7,4 +7,5 @@ enum {
TAG_NOT_NAMED = 5,
TAG_NOT_FOUND = 6,
UNRECOGNIZED_TAG = 7,
ASSIMETRY = 8,
};

View File

@ -15,10 +15,10 @@ std::vector<std::string> asymmetric_special_list;
bool is_asymmetric;
int ignore_count = 1;
int ignore_i = 1;
std::string buffer;
std::string tag_candidate;
std::stack<std::string> tag_stack;
extern int xml_lex_destroy(void);
extern int csml_lex_destroy(void);

View File

@ -3,6 +3,7 @@
#include <vector>
#include <string>
#include <algorithm>
#include <stack>
#define DECLARE_LEXER(x) \
extern FILE * x ## _in; \
@ -26,19 +27,15 @@ extern bool is_asymmetric;
extern char quote;
/* number of '{' & '}'s to be placed around the current ignored block
*/
extern int ignore_count;
/* number of '}'s so far
*/
extern int ignore_i;
/* used for saving sections whose starting projection
* cannot be determined before reading the whole section
* (e.g. comments (single- or multiline?))
*/
extern std::string buffer;
extern std::stack<std::string> tag_stack;
extern std::string tag_candidate;
#define SCANNER_H
#endif

View File

@ -7,10 +7,7 @@
#include "html_special.hpp"
#include "exit_values.hpp"
unsigned xml_tag_stack = 0;
static bool is_comment_multiline;
static std::string current_tag;
static char current_string_quote;
int state_buffer;
@ -18,20 +15,13 @@ int state_buffer;
unsigned long long cursor_position = 0;
#define YY_USER_ACTION cursor_position += yyleng;
inline
static
void xml_tag_stack_push() {
++xml_tag_stack;
}
/* number of '{' & '}'s to be placed around the block contents
*/
static int extra_brace_count = 0;
inline
static
void xml_tag_stack_pop() {
if (!xml_tag_stack) {
exit(POLUTED_STACK);
}
--xml_tag_stack;
}
/* number of '{' | '}'s so far; used for maximum search, updating extra_brace_count
*/
static int extra_brace_i = 0;
%}
%option noyywrap
@ -43,7 +33,7 @@ void xml_tag_stack_pop() {
%x COMMENT
%x DECLARATION
%x STRING
%x IGNORE IGNORE_COUNT_START IGNORE_COUNT_END
%x BRACE_COUNT_START BRACE_COUNT_END
ws [ \t\r\v\f]
wsnl [ \t\r\v\f\n]
@ -52,11 +42,15 @@ identifier [A-z][A-z0-9]*
%%
<INITIAL>{
\< {
ECHOS(buffer.c_str());
buffer = "";
BEGIN TAG_START;
}
\<\!-- {
ECHOS(buffer.c_str());
buffer = "";
is_comment_multiline = false;
buffer = std::string("");
buffer = "";
BUFFER("//");
BEGIN COMMENT;
}
@ -65,21 +59,65 @@ identifier [A-z][A-z0-9]*
BEGIN DECLARATION;
}
&[A-z]+; {
ECHOS(html_special_to_utf8(html_special_table_lookup(yytext)));
BUFFER(html_special_to_utf8(html_special_table_lookup(yytext)));
}
&[0-9]+; {
ECHOS(html_special_to_utf8(yytext));
BUFFER(html_special_to_utf8(yytext));
}
[(){};] {
ECHOC('\\');
ECHOC(yytext[0]);
/*
[(){};] {
ECHOC('\\');
ECHOC(yytext[0]);
}
*/
\<\/{wsnl}*{identifier}+{wsnl}*\> {
/* Closing tag: verify it matches the innermost open tag, then flush the
* buffered element body wrapped in the required number of braces.
*/
/* a closing tag with no open tag on the stack is an error */
if (tag_stack.empty()) {
exit(POLUTED_STACK);
}
/* work on a heap copy of the match so it can be trimmed;
* NOTE(review): presumably trim() strips the surrounding "</", ">" and
* whitespace so that only the tag name remains -- confirm against trim()
*/
char * dup = strdup(yytext);
trim(dup);
bool eq = (tag_stack.top() == dup);
free(dup);
/* the closing tag must name the tag currently on top of the stack */
if (not eq) {
exit(ASSIMETRY);
}
/* extra_brace_count additional '{'s are written in front of the body */
for (int i = 0; i < extra_brace_count; i++) {
ECHOC('{');
}
ECHOS(buffer.c_str());
/* starting at -1 writes extra_brace_count + 1 '}'s;
* NOTE(review): the extra one presumably closes the '{' written when the
* opening tag was finished -- confirm
*/
for (int i = -1; i < extra_brace_count; i++) {
ECHOC('}');
}
/* reset the per-element state and leave the element */
buffer = "";
extra_brace_count = 0;
tag_stack.pop();
}
\{ {
BUFFER(yytext);
BEGIN BRACE_COUNT_START;
}
\} {
BUFFER(yytext);
BEGIN BRACE_COUNT_END;
}
.|\n {
ECHO;
BUFFER(yytext);
}
}
<COMMENT>{
--\> {
/* End of an XML comment: write out the buffered comment text and
* return to INITIAL.
*/
if (is_comment_multiline) {
/* overwrite the second buffered character with '*' and append the
* closing marker;
* NOTE(review): assumes the buffer starts with the "//" pushed when the
* comment was opened, so this turns it into "/ * ... * /" form -- confirm
*/
buffer[1] = '*';
buffer += "*/";
}
ECHOS(buffer.c_str());
/* clear the buffer so the comment text is not written again later */
buffer = "";
BEGIN INITIAL;
}
. {
BUFFER(yytext);
}
@ -87,14 +125,6 @@ identifier [A-z][A-z0-9]*
BUFFER(yytext);
is_comment_multiline = true;
}
--\> {
if (is_comment_multiline) {
buffer[1] = '*';
buffer += "*/";
}
ECHOS(buffer.c_str());
BEGIN INITIAL;
}
}
<DECLARATION>{
@ -114,14 +144,9 @@ identifier [A-z][A-z0-9]*
}
<TAG_START>{
\/{identifier}+{wsnl}*\> {
xml_tag_stack_pop();
ECHOC('}');
BEGIN INITIAL;
}
{identifier}+ {
ECHO;
current_tag = yytext;
tag_stack.emplace(yytext);
BEGIN TAG_MAYBE;
}
. {
@ -141,12 +166,12 @@ identifier [A-z][A-z0-9]*
<TAG_MAYBE>{
\> {
xml_tag_stack_push();
ECHOS(" {");
BEGIN IGNORE;
BEGIN INITIAL;
}
\/\> {
ECHOC(';');
tag_stack.pop();
BEGIN INITIAL;
}
{wsnl} {
@ -169,12 +194,12 @@ identifier [A-z][A-z0-9]*
ECHOS(": ");
}
\> {
xml_tag_stack_push();
ECHOS(") {");
BEGIN IGNORE;
BEGIN INITIAL;
}
\/\> {
ECHOS(");");
tag_stack.pop();
BEGIN INITIAL;
}
{ws} {
@ -186,7 +211,7 @@ identifier [A-z][A-z0-9]*
}
<TAG_ASYMETRIC_SPECIAL>{
.\> {
.\> {
ECHO;
is_asymmetric = std::find(asymmetric_special_list.begin(),
asymmetric_special_list.end(),
@ -231,64 +256,26 @@ identifier [A-z][A-z0-9]*
}
}
<IGNORE>{
\<\/{identifier}+\> {
char * dup;
dup = strdup(yytext);
trim(dup);
const int eq = !strcmp(dup, current_tag.c_str());
free(dup);
if (eq) {
for (int i = 0; i < ignore_count; i++) {
ECHOC('{');
}
ECHOS(buffer.c_str());
for (int i = -1; i < ignore_count; i++) {
ECHOC('}');
}
ignore_count = 1;
xml_tag_stack_pop();
BEGIN INITIAL;
} else {
BUFFER(yytext);
ECHO;
}
}
<BRACE_COUNT_START>{
\{ {
BUFFER(yytext);
BEGIN IGNORE_COUNT_START;
++extra_brace_i;
}
}
<BRACE_COUNT_END>{
\} {
BUFFER(yytext);
BEGIN IGNORE_COUNT_END;
++extra_brace_i;
}
}
<BRACE_COUNT_START,BRACE_COUNT_END>{
.|\n {
BUFFER(yytext);
}
}
<IGNORE_COUNT_START>{
\{ {
BUFFER(yytext);
++ignore_i;
}
}
<IGNORE_COUNT_END>{
\} {
BUFFER(yytext);
++ignore_i;
}
}
<IGNORE_COUNT_START,IGNORE_COUNT_END>{
.|\n {
BUFFER(yytext);
if (ignore_i > ignore_count) {
ignore_count = ignore_i;
yyless(0);
if (extra_brace_i > extra_brace_count) {
extra_brace_count = extra_brace_i;
}
ignore_i = 0;
BEGIN IGNORE;
extra_brace_i = 0;
BEGIN INITIAL;
}
}
%%