Patch: Elm ME+ 2.5 PLalpha62 -> Elm ME+ 2.5 PLalpha63 [4/7] (3/7)
From
Kari Hurtta@21:1/5 to
All on Wed Jan 3 13:52:11 2024
[continued from previous message]
+ free_name_entity_match(& (token->match_reference));
+ token->numeric_reference = UNICODE_BAD_CHAR; /* Numeric reference */
+ if (token->doctype_item) /* DOCTYPE information splitten */
+ free_string(&(token->doctype_item));
+
+
+ do {
+ /* NOTE: It is assumed that CRLF are same than on ascii */
+ int ch = EOF;
+
+ if (state_ready(token->rchar)) {
+
+ /* Have one character collected */
+
+ uint16 u = give_unicode_from_state(token->rchar);
+
+ len = string_len(token->sbuffer);
+
+ DPRINT(Debug,22,(&Debug,
+ "get_new_tagfilter_token: u=%04u",
+ u));
+
+ if (state_printable(token->rchar)) {
+ DPRINT(Debug,22,(&Debug," '%C'",
+ token->rchar));
+ }
+ DPRINT(Debug,22,(&Debug," tag_state=%d",
+ token->tag_state));
+ switch (token->tag_state) {
+ case ts_init: DPRINT(Debug,22,(&Debug," ts_init")); break;
+ case ts_tag_start: DPRINT(Debug,22,(&Debug," ts_tag_start")); break;
+ case ts_tag_bang: DPRINT(Debug,22,(&Debug," ts_tag_bang")); break;
+ case ts_tag_endmark: DPRINT(Debug,22,(&Debug," ts_tag_endmark")); break;
+ case ts_tag_params: DPRINT(Debug,22,(&Debug," ts_tag_params")); break;
+ case ts_tag_ending: DPRINT(Debug,22,(&Debug," ts_tag_ending")); break;
+ case ts_tag_bogus_comment: DPRINT(Debug,22,(&Debug," ts_tag_bogus_comment"));
+ break;
+ case ts_tag_self_closing: DPRINT(Debug,22,(&Debug," ts_tag_self_closing"));
+ break;
+ case ts_tag_comment_start: DPRINT(Debug,22,(&Debug," ts_tag_comment_start"));
+ break;
+ case ts_tag_comment: DPRINT(Debug,22,(&Debug," ts_tag_comment")); break;
+ case ts_tag_after_atrname: DPRINT(Debug,22,(&Debug," ts_tag_after_atrname"));
+ break;
+ case ts_tag_atrvalue: DPRINT(Debug,22,(&Debug," ts_tag_atrvalue"));
+ break;
+ case ts_tag_after_quoted: DPRINT(Debug,22,(&Debug," ts_tag_after_quoted"));
+ break;
+ case ts_tag_doctype_start: DPRINT(Debug,22,(&Debug," ts_tag_doctype_start"));
+ break;
+ case ts_tag_doctype_bogus: DPRINT(Debug,22,(&Debug," ts_tag_doctype_bogus"));
+ break;
+ }
+ DPRINT(Debug,22,(&Debug," entity_state=%d",token->entity_state));
+ switch (token->entity_state) {
+ case ent_init: DPRINT(Debug,22,(&Debug," ent_init")); break;
+ case ent_entity_start: DPRINT(Debug,22,(&Debug," ent_entity_start")); break;
+ case ent_decimal_ent: DPRINT(Debug,22,(&Debug," ent_decimal_ent")); break;
+ case ent_hexdec_ent: DPRINT(Debug,22,(&Debug," ent_hexdec_ent")); break;
+ }
+ DPRINT(Debug,22,(&Debug," tag_flags=%d",token->tag_flags));
+ if (ison(token->tag_flags, TFLAG_seen_nl)) {
+ DPRINT(Debug,22,(&Debug," TFLAG_seen_nl"));
+ }
+ if (ison(token->tag_flags, TFLAG_self_closing)) {
+ DPRINT(Debug,22,(&Debug," TFLAG_self_closing"));
+ }
+ if (ison(token->tag_flags, TFLAG_seen_equals)) {
+ DPRINT(Debug,22,(&Debug," TFLAG_seen_equals"));
+ }
+ if (ison(token->tag_flags, TFLAG_num_overflow)) {
+ DPRINT(Debug,22,(&Debug," TFLAG_num_overflow"));
+ }
+ if (ison(token->tag_flags, TFLAG_unhandled_class)) {
+ DPRINT(Debug,22,(&Debug," TFLAG_unhandled_class"));
+ }
+ DPRINT(Debug,22,(&Debug," tag_quote=%d\n",
+ token->tag_quote));
+
+ if (0x000D /* CR '\r' */ == u) {
+ int peekch = state_getc(state_in);
+
+ if ('\n' == peekch) { /* Got newline */
+ ch = peekch;
+ reset_state(token->rchar,0);
+ goto process_char;
+
+ } else if (EOF != peekch)
+ state_ungetc(peekch,state_in);
+
+ if (ison(tagfilter->tagflt_mode,TAGFLT_MODE_convert_cr))
+ u = 0x000A /* LF '\n' */;
+ }
+
+ #define CHECK_SBUFFER_LEN(error_class) do { \
+ len = string_len(token->sbuffer); \
+ if (len >= MAX_TAG_TOKEN) { \
+ DPRINT(Debug,20,(&Debug, \
+ "get_new_tagfilter_token: Read %d characters\n", \
+ len)); \
+ ret = len; \
+ token->token_class = error_class; \
+ goto out; \
+ } \
+ } while(0)
+
+ #define ADD_SBUFFER_CHAR(error_class) do { \
+ add_state_to_string(token->sbuffer,token->rchar); /* Returns no status */ \
+ reset_state(token->rchar,0); /* Get next character */ \
+ CHECK_SBUFFER_LEN(error_class); \
+ } while(0)
+
+ #define ADD_SBUFFER_CHAR_NOCHECK do { \
+ add_state_to_string(token->sbuffer,token->rchar); /* Returns no status */ \
+ reset_state(token->rchar,0); /* Get next character */ \
+ len = string_len(token->sbuffer); \
+ } while (0)
+
+ #define EMIT_TOKEN_CLASS(class) do { \
+ ret = len; \
+ token->token_class = class; \
+ goto out; \
+ } while (0)
+
+ if (ison(token->tag_flags,TFLAG_seen_nl)) {
+
+ /* Start newline span */
+
+ if (0x000A /* LF '\n' */ == u) {
+
+ ADD_SBUFFER_CHAR(tf_span_nl);
+
+ } else {
+
+ clearit(token->tag_flags, TFLAG_seen_nl);
+
+ if (len > 0) {
+ /* got newline span */
+
+ EMIT_TOKEN_CLASS(tf_span_nl);
+ }
+ }
+
+
+ } else if (ts_tag_doctype_bogus == token->tag_state) {
+ /* Bogus DOCTYPE line */
+
+ if (token->tag_lookahead)
+ free_string(& (token->tag_lookahead));
+
+ if (0x003E /* > */ == u) {
+
+ token->tag_state = ts_init; /* Tag ended */
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_doctype_end);
+
+ } else if ( 0x000A /* LF '\n' (new line) */ == u) {
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ EMIT_TOKEN_CLASS(tf_doctype_segment);
+
+ } else if (0x0000 /* NUL */ == u) {
+ uint16 bad_char = UNICODE_BAD_CHAR;
+
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Found NUL character when parsing doctype line\n"));
+
+ reset_state(token->rchar,0); /* Get next character */
+ token->error = 1;
+ add_unicode_to_string(token->sbuffer,1,&bad_char);
+
+ CHECK_SBUFFER_LEN(tf_doctype_segment);
+ } else {
+ ADD_SBUFFER_CHAR(tf_doctype_segment);
+ }
+
+ } else if (ts_tag_doctype_start == token->tag_state) {
+
+ int lookahead_len = 0;
+
+ /* Seen <!DOCTYPE */
+
+ if (token->tag_lookahead)
+ lookahead_len = string_len(token->tag_lookahead);
+
+ /* Not correct .... */
+
+ if (0x003E /* > */ == u) {
+
+ if (token->tag_quote) {
+
+ /* If > is inside a quoted value, treat it as an error */
+
+ if (token->tag_lookahead)
+ free_string(& (token->tag_lookahead));
+
+ token->tag_state = ts_tag_doctype_bogus;
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+ }
+
+ if (token->tag_lookahead) {
+ token->doctype_item = token->tag_lookahead;
+ token->tag_lookahead = NULL;
+ EMIT_TOKEN_CLASS(tf_doctype_item);
+
+ }
+
+ token->tag_state = ts_init; /* Tag ended */
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_doctype_end);
+
+ } else if (0x000A /* LF '\n' (new line) */ == u) {
+
+
+ if (token->tag_quote && token->tag_lookahead) {
+ add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
+ }
+
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ if (token->tag_quote && token->tag_lookahead) {
+
+ lookahead_len = string_len(token->tag_lookahead);
+ if (lookahead_len > MAX_DOCTYPE_VALUE) {
+
+ token->tag_state = ts_tag_doctype_bogus;
+ /* Forget it */
+
+ free_string(& (token->tag_lookahead));
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+ }
+
+ EMIT_TOKEN_CLASS(tf_doctype_segment);
+
+ } else if (token->tag_lookahead) {
+
+ token->doctype_item = token->tag_lookahead;
+ token->tag_lookahead = NULL;
+ EMIT_TOKEN_CLASS(tf_doctype_item);
+
+ } else {
+ EMIT_TOKEN_CLASS(tf_doctype_space);
+ }
+
+ } else if (0x0000 /* NUL */ == u) {
+ uint16 bad_char = UNICODE_BAD_CHAR;
+
+ if (!token->tag_lookahead) {
+ len = string_len(token->sbuffer);
+ if (len > 0) {
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Found NUL character when parsing doctype line - flushing buffer before it\n"));
+ EMIT_TOKEN_CLASS(tf_doctype_space);
+ }
+ }
+
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Found NUL character when parsing doctype line\n"));
+
+ reset_state(token->rchar,0); /* Get next character */
+ token->error = 1;
+
+ add_unicode_to_string(token->sbuffer,1,&bad_char);
+
+ if (!token->tag_lookahead)
+ token->tag_lookahead = new_string(token->text_charset);
+
+ add_unicode_to_string(token->tag_lookahead,1,&bad_char);
+ lookahead_len = string_len(token->tag_lookahead);
+ if (lookahead_len > MAX_DOCTYPE_VALUE) {
+
+ token->tag_state = ts_tag_doctype_bogus;
+ /* Forget it */
+
+ free_string(& (token->tag_lookahead));
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+ }
+
+ } else if (token->tag_quote && token->tag_lookahead) {
+
+ /* Does not detect if there is no space after quote */
+
+ if (token->tag_quote == u) {
+ token->tag_quote = 0;
+ add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ token->doctype_item = token->tag_lookahead;
+ token->tag_lookahead = NULL;
+ EMIT_TOKEN_CLASS(tf_doctype_item);
+ } else {
+
+ add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
+ lookahead_len = string_len(token->tag_lookahead);
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ if (lookahead_len > MAX_DOCTYPE_VALUE) {
+
+ token->tag_state = ts_tag_doctype_bogus;
+ /* Forget it */
+
+ free_string(& (token->tag_lookahead));
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+ }
+
+ CHECK_SBUFFER_LEN(tf_doctype_segment);
+ }
+
+ } else {
+ if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
+ 0x000C /* FF '\f' (form feed) */ == u ||
+ 0x0020 /* SPACE */ == u) {
+
+ if (token->tag_lookahead) {
+ token->doctype_item = token->tag_lookahead;
+ token->tag_lookahead = NULL;
+ EMIT_TOKEN_CLASS(tf_doctype_item);
+
+ }
+
+ ADD_SBUFFER_CHAR(tf_doctype_space);
+
+ } else if (0x0022 /* " */ == u ||
+ 0x0027 /* ' */ == u) {
+
+ /* Not correct parsing because this does not check
+ was that part what was required to be quoted
+ */
+
+ if (token->tag_lookahead) {
+
+ token->tag_state = ts_tag_doctype_bogus;
+ /* Forget it */
+
+ free_string(& (token->tag_lookahead));
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+
+ } else {
+
+ len = string_len(token->sbuffer);
+ if (len > 0) {
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Flushing buffer before quoted item when parsing doctype line\n"));
+ EMIT_TOKEN_CLASS(tf_doctype_space);
+ }
+
+ token->tag_lookahead = new_string(token->text_charset);
+
+ add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
+ token->tag_quote = u;
+ }
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ } else {
+
+ /* Enable buffer */
+
+ if (!token->tag_lookahead) {
+ len = string_len(token->sbuffer);
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_doctype_space);
+ }
+
+ token->tag_lookahead = new_string(token->text_charset);
+ }
+
+ /* XXXXX -- correct missing? */
+
+ add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
+ lookahead_len = string_len(token->tag_lookahead);
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ if (lookahead_len > MAX_DOCTYPE_VALUE) {
+
+ token->tag_state = ts_tag_doctype_bogus;
+ /* Forget it */
+
+ free_string(& (token->tag_lookahead));
+ EMIT_TOKEN_CLASS(tf_doctype_error);
+ }
+
+ CHECK_SBUFFER_LEN(tf_doctype_segment);
+ }
+ }
+
+ } else if (ts_tag_after_quoted == token->tag_state) {
+ /* After quoted value */
+
+ if ( 0x000A /* LF '\n' (new line) */ == u) {
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ /* Parse new attribute instead */
+ token->tag_state = ts_tag_params;
+
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
+ 0x000C /* FF '\f' (form feed) */ == u ||
+ 0x0020 /* SPACE */ == u) {
+
+ /* Parse new attribute instead */
+ token->tag_state = ts_tag_params;
+
+ } else if (0x003E /* > */ == u ||
+ 0x002F /* / */ == u) {
+
+ token->tag_state = ts_tag_ending; /* Process for /> or > */
+
+ } else {
+ token->tag_state = ts_tag_params;
+
+ EMIT_TOKEN_CLASS(tf_tag_error);
+ }
+
+ } else if (ent_hexdec_ent == token->entity_state) {
+
+ uint16 v = 0;
+
+ if (0x0030 /* 0 */ <= u && u <= 0x0039 /* 9 */)
+ v = u - 0x0030 /* 0 */;
+ else if (0x0041 /* A */ <= u && u <= 0x0046 /* F */)
+ v = u - 0x0041 /* A */ + 10;
+ else if (0x0061 /* a */ <= u && u <= 0x0066 /* f */)
+ v = u - 0x0061 /* a */ + 10;
+ else if (0x003B /* ; */ == u) {
+
+ if (isoff(token->tag_flags,TFLAG_num_overflow)) {
+
+ token->entity_state = ent_init;
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_numeric_entity);
+
+ } else {
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ goto end_entity;
+ }
+ } else {
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ goto end_entity;
+ }
+
+ /* Hexadecimal numeric character reference */
+
+ if (isoff(token->tag_flags,TFLAG_num_overflow)) {
+
+ if (token->numeric_reference > 0xFFFF / 16) {
+ setit(token->tag_flags,TFLAG_num_overflow);
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ } else {
+ token->numeric_reference *= 16;
+ token->numeric_reference += v;
+ }
+ }
+
+ ADD_SBUFFER_CHAR(tf_entity_error);
+
+ } else if (ent_decimal_ent == token->entity_state) {
+
+ if (2 == len && ( 0x0078 /* x */ == u ||
+ 0x0058 /* X */ == u)) {
+
+ /* Hexadecimal numeric character reference */
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ token->entity_state = ent_hexdec_ent;
+
+ } else if (0x0030 /* 0 */ <= u && u <= 0x0039 /* 9 */) {
+
+ /* Decimal numeric character reference */
+
+ if (isoff(token->tag_flags,TFLAG_num_overflow)) {
+
+ if (token->numeric_reference > 0xFFFF / 10) {
+ setit(token->tag_flags,TFLAG_num_overflow);
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ } else {
+ token->numeric_reference *= 10;
+ token->numeric_reference += u - 0x0030 /* 0 */;
+ }
+ }
+
+ ADD_SBUFFER_CHAR(tf_entity_error);
+ } else if (0x003B /* ; */ == u) {
+
+ if (isoff(token->tag_flags,TFLAG_num_overflow)) {
+
+ token->entity_state = ent_init;
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_numeric_entity);
+
+ } else {
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ goto end_entity;
+ }
+ } else {
+ token->numeric_reference = UNICODE_BAD_CHAR;
+ goto end_entity;
+ }
+
+ } else if (ent_entity_start == token->entity_state) {
+
+ /* & on body or on attribute value */
+
+ if (1 == len && 0x0023 /* # */ == u) {
+
+ /* Decimal numeric character reference */
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ token->entity_state = ent_decimal_ent;
+ token->numeric_reference = 0; /* Numeric reference */
+ clearit(token->tag_flags,TFLAG_num_overflow);
+
+ if (token->named_reference) /* Named reference including & ; */
+ free_string(&(token->named_reference));
+
+ if (token->walk_reference) /* Parsing of named reference */
+ free_name_entity_walk(&(token->walk_reference));
+
+ } else if (tagfilter_entity_character(u,NULL)) {
+
+ /* Named character references */
+
+ /* Returns no status */
+ add_state_to_string(token->named_reference,token->rchar);
+
+ ADD_SBUFFER_CHAR(tf_entity_error);
+
+ if (token->walk_reference) {
+ /* free'es struct name_entity_walk, if no match */
+ advance_entity_walk(& (token->walk_reference), u);
+ }
+
+ /* XXXXX --- not correct */
+
+ } else {
+
+ end_entity:
+ if (0x003B /* ; */ == u) {
+
+ if (token->named_reference) { /* Named reference including & ; */
+ /* Returns no status */
+ add_state_to_string(token->named_reference,token->rchar);
+ }
+
+ if (token->walk_reference) {
+ /* increment refcount */
+ token->match_reference =
+ tagfilter_match_reference(token->walk_reference,
+ tagfilter_match_semicolon);
+
+ free_name_entity_walk(&(token->walk_reference));
+ }
+
+ /* End of entity */
+ /* XXXX this accepts also no matching character references
+ even when there is shorter match
+ */
+
+ token->entity_state = ent_init;
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_entity);
+ } else {
+
+ if (0x000A /* LF '\n' */ == u) {
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ }
+
+ if (token->named_reference) {
+ /* &xxx= on tags are not interpreted as character reference */
+
+ if (ts_tag_atrvalue == token->tag_state &&
+ !token->atr_value_segment &&
+ 0x003D /* = */ == u) {
+
+ if (token->walk_reference) /* Parsing of named reference */
+ free_name_entity_walk(&(token->walk_reference));
+
+ token->atr_value_segment = token->named_reference;
+ token->named_reference = NULL;
+ token->entity_state = ent_init;
+
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(&(token->named_reference));
+ }
+
+ if (token->walk_reference) {
+ /* is this acceptable without ;
+
+ XXXX this is parse error, but it is ignored
+ */
+
+ /* increment refcount */
+ token->match_reference =
+ tagfilter_match_reference(token->walk_reference,
+ tagfilter_match_prefix);
+
+
+ free_name_entity_walk(&(token->walk_reference));
+
+ if (token->match_reference) {
+ token->entity_state = ent_init;
+ EMIT_TOKEN_CLASS(tf_entity);
+ }
+ }
+
+ /* XXXX Only handled on body */
+ if (token->named_reference && ts_init == token->tag_state) {
+ int reflen,i;
+
+ reflen = string_len(token->named_reference);
+ /* Perhaps there was shorter match */
+ token->walk_reference = tagfilter_start_reference(tagfilter->have_entities);
+
+ for (i = 0; i < reflen; i++) {
+ uint16 x = give_unicode_from_string(token->named_reference,i);
+
+ /* Skip */
+ if (0x0026 /* & */ == x && 0 == i)
+ continue;
+
+ /* increment refcount */
+ token->match_reference =
+ tagfilter_match_reference(token->walk_reference,
+ tagfilter_match_prefix);
+
+ if (token->match_reference) {
+ /* XXXX this founds shortest prefix only */
+
+ /* XXXX this is parse error, but it is ignored */
+
+ struct string * temp = token->named_reference;
+ int POS = 0;
+
+ token->named_reference = clip_from_string(temp,&POS,i);
+ token->resubmit = clip_from_string(temp,&POS,reflen);
+
+ free_string(&temp);
+ free_name_entity_walk(&(token->walk_reference));
+
+ token->entity_state = ent_init;
+ EMIT_TOKEN_CLASS(tf_entity);
+ }
+
+ /* free'es struct name_entity_walk, if no match */
+ advance_entity_walk(& (token->walk_reference), x);
+
+ if (! token->walk_reference)
+ break; /* Scan failed */
+ }
+
+ if (token->walk_reference) /* Scan was not endded */
+ free_name_entity_walk(&(token->walk_reference));
+ }
+
+ /* Parse error on entity -- do not include character */
+
+ token->entity_state = ent_init;
+
+ EMIT_TOKEN_CLASS(tf_entity_error);
+ }
+ }
+
+
+ } else if (ts_tag_atrvalue == token->tag_state) {
+
+ int seglen = 0;
+
+ if (!token->atr_value_segment) /* one part of arribute value */
+ token->atr_value_segment = new_string(token->text_charset);
+
+ seglen = string_len(token->atr_value_segment);
+
+ if (0x000A /* LF '\n' (new line) */ == u) {
+
+ if (token->tag_quote) {
+
+ add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
+ seglen = string_len(token->atr_value_segment);
+
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+
+ } else {
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(& token->atr_value_segment);
+
+ /* Consume newline */
+
+ if (token->atr_name) /* Reset name of attribute on tag */
+ free_string(&(token->atr_name));
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ /* Parse new attribute instead */
+ token->tag_state = ts_tag_params;
+
+ /* Value ends */
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
+ }
+
+ } else if (! token->tag_quote &&
+ (0x0009 /* HT '\t' (horizontal tab) */ == u ||
+ 0x000C /* FF '\f' (form feed) */ == u ||
+ 0x0020 /* SPACE */ == u)) {
+
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(& token->atr_value_segment);
+
+ if (token->atr_name) /* Reset name of attribute on tag */
+ free_string(&(token->atr_name));
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ /* Parse new attribute instead */
+ token->tag_state = ts_tag_params;
+
+ /* Value ends */
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
+
+ } else if (token->tag_quote &&
+ token->tag_quote == u) {
+
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(& token->atr_value_segment);
+
+ if (token->atr_name) /* Reset name of attribute on tag */
+ free_string(&(token->atr_name));
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+ token->tag_quote = 0;
+ token->tag_state = ts_tag_after_quoted;
+
+ /* Value ends */
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
+
+ } else if (0x0026 /* & */ == u &&
+ tagfilter->have_entities) {
+
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(& token->atr_value_segment);
+
+
+ token->entity_state = ent_entity_start;
+
+ /* Start & .... */
+ token->named_reference = new_string(token->text_charset);
+ /* Returns no status */
+ add_state_to_string(token->named_reference,token->rchar);
+ token->walk_reference = tagfilter_start_reference(tagfilter->have_entities);
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ } else if (! token->tag_quote &&
+ 0x003E /* > */ == u) {
+
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ free_string(& token->atr_value_segment);
+
+ if (token->atr_name) /* Reset name of attribute on tag */
+ free_string(&(token->atr_name));
+
+
+ /* Will emit tag end */
+ token->tag_state = ts_tag_ending;
+
+ /* Value ends */
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
+
+
+ } else if (0x0000 /* NUL */ == u) {
+ uint16 bad_char = UNICODE_BAD_CHAR;
+
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Found NUL character when parsing attribute value\n"));
+
+ reset_state(token->rchar,0); /* Get next character */
+
+ token->error = 1;
+
+ add_unicode_to_string(token->atr_value_segment,1,&bad_char);
+ seglen = string_len(token->atr_value_segment);
+
+ add_unicode_to_string(token->sbuffer,1,&bad_char);
+
+ if (seglen >= MAX_VALUE_SEGMENT) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+ CHECK_SBUFFER_LEN(tf_tag_atrvalue_error);
+
+ } else if (! token->tag_quote &&
+ (0x0022 /* " */ == u ||
+ 0x0027 /* ' */ == u ||
+ 0x003C /* < */ == u ||
+ 0x003D /* = */ == u ||
+ 0x0060 /* ` */ == u)) {
+
+ if (seglen > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+
+ add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
+ ADD_SBUFFER_CHAR_NOCHECK;
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_error);
+
+ } else {
+ add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
+ seglen = string_len(token->atr_value_segment);
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ if (seglen >= MAX_VALUE_SEGMENT) {
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
+ }
+ CHECK_SBUFFER_LEN(tf_tag_atrvalue_error);
+ }
+
+ } else if (ts_tag_after_atrname == token->tag_state) {
+ /* Got attribute name, parse = */
+
+ if (0x000A /* LF '\n' (new line) */ == u) {
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ EMIT_TOKEN_CLASS(tf_tag_space);
+
+ } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
+ 0x000C /* FF '\f' (form feed) */ == u ||
+ 0x0020 /* SPACE */ == u) {
+
+ ADD_SBUFFER_CHAR(tf_tag_space);
+
+ } else if (isoff(token->tag_flags,TFLAG_seen_equals) &&
+ 0x002F /* / */ == u) {
+
+ /* / do not end when parsing unquoted atribute value? */
+
+ token->tag_state = ts_tag_ending; /* Process for /> or > */
+
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+
+ } else if (0x003E /* > */ == u) {
+
+ /* If = seen this is error */
+
+ token->tag_state = ts_tag_ending; /* Process for /> or > */
+
+ if (ison(token->tag_flags,TFLAG_seen_equals)) {
+ EMIT_TOKEN_CLASS(tf_tag_error);
+ } else if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+ } else if (isoff(token->tag_flags,TFLAG_seen_equals) &&
+ 0x003D /* = */ == u) {
+
+ /* Emit first spaces */
+
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+ setit(token->tag_flags,TFLAG_seen_equals);
+ EMIT_TOKEN_CLASS(tf_tag_atrequal);
+
+ } else if (ison(token->tag_flags,TFLAG_seen_equals) &&
+ (0x0022 /* " */ == u ||
+ 0x0027 /* ' */ == u)) {
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+
+ clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+ token->tag_quote = u;
+ token->tag_state = ts_tag_atrvalue;
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_start);
+
+ } else {
+
+ if (ison(token->tag_flags,TFLAG_seen_equals)) {
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+
+ clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+ token->tag_quote = 0;
+ token->tag_state = ts_tag_atrvalue;
+ EMIT_TOKEN_CLASS(tf_tag_atrvalue_start);
+ } else {
+ /* Parse new attribute instead */
+
+ if (token->atr_name) /* Reset name of attribute on tag */
+ free_string(&(token->atr_name));
+
+ clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
+
+ token->tag_state = ts_tag_params;
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_space);
+ }
+ }
+ }
+
+ } else if (ts_tag_ending == token->tag_state) {
+ /* Process for /> or > */
+
+ if (0x002F /* / */ == u) {
+
+ if (isoff(token->tag_flags,TFLAG_self_closing)) {
+ setit(token->tag_flags,TFLAG_self_closing);
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+ } else {
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_tag_param_error);
+ }
+ } else if (0x003E /* > */ == u) {
+ token->tag_state = ts_init; /* Tag ended */
+
+ if (token->tag_name) /* Clear name of start or end tag */
+ free_string(&(token->tag_name));
+ if (token->atr_name) /* Clear name of attribute on tag */
+ free_string(&(token->atr_name));
+
+
+ if (ison(token->tag_flags,TFLAG_self_closing)) {
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ clearit(token->tag_flags,TFLAG_self_closing); /* Passed already */
+ EMIT_TOKEN_CLASS(tf_tag_selfclosed_end);
+ } else {
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ EMIT_TOKEN_CLASS(tf_tag_end);
+ }
+
+ } else {
+ /* Possible / without > */
+
+ token->tag_state = ts_tag_params;
+
+ EMIT_TOKEN_CLASS(tf_tag_param_error);
+ }
+
+ } else if (ts_tag_params == token->tag_state) {
+ /* Seen space after < or expecting params */
+
+ if (token->atr_name) { /* Name of attribute on tag */
+
+ if (0x000A /* LF '\n' (new line) */ == u) {
+ /* Consume newline */
+
+ token->have_nl = 1;
+ reset_state(token->rchar,0);
+
+ token->tag_state = ts_tag_after_atrname; /* Got attribute name, parse = */
+ EMIT_TOKEN_CLASS(tf_tag_atrname);
+
+ } else if (0x002F /* / */ == u ||
+ 0x003E /* > */ == u) {
+
+
+ token->tag_state = ts_tag_ending; /* Process for /> or > */
+
+ if (len > 0) {
+ EMIT_TOKEN_CLASS(tf_tag_atrname);
+ }
+
+ } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
+ 0x000C /* FF '\f' (form feed) */ == u ||
+ 0x0020 /* SPACE */ == u ||
+ 0x003D /* = */ == u) {
+
+ token->tag_state = ts_tag_after_atrname; /* Got attribute name, parse = */
+ EMIT_TOKEN_CLASS(tf_tag_atrname);
+
+ } else if (0x0022 /* " */ == u ||
+ 0x0027 /* ' */ == u ||
+ 0x003C /* < */ == u) {
+
+ /* Report as error, but still collect into the attribute name */
+
+ /* Returns no status */
+ add_state_to_string(token->atr_name,token->rchar);
+
+ ADD_SBUFFER_CHAR_NOCHECK;
+
+ /* Same time this is displayed? as error */
+
+ EMIT_TOKEN_CLASS(tf_tag_param_error);
+
+ } else if (0x0000 /* NUL */ == u) {
+ uint16 bad_char = UNICODE_BAD_CHAR;
+
+ DPRINT(Debug,20,(&Debug,
+ "get_new_tagfilter_token: Found NUL character when parsing attribute name\n"));
+
+ reset_state(token->rchar,0); /* Get next character */
+
+ token->error = 1;
+
+ add_unicode_to_string(token->atr_name,1,&bad_char);
+
+ add_unicode_to_string(token->sbuffer,1,&bad_char);
[continued in next message]
--- SoupGate-Win32 v1.05
* Origin: fsxNet Usenet Gateway (21:1/5)