• Patch: Elm ME+ 2.5 PLalpha62 -> Elm ME+ 2.5 PLalpha63 [4/7] (3/7)

    From Kari Hurtta@21:1/5 to All on Wed Jan 3 13:52:11 2024
    [continued from previous message]

    + free_name_entity_match(& (token->match_reference));
    + token->numeric_reference = UNICODE_BAD_CHAR; /* Numeric reference */
    + if (token->doctype_item) /* DOCTYPE information splitten */
    + free_string(&(token->doctype_item));
    +
    +
    + do {
    + /* NOTE: It is assumed that CRLF are same than on ascii */
    + int ch = EOF;
    +
    + if (state_ready(token->rchar)) {
    +
    + /* Have one chararacter collected */
    +
    + uint16 u = give_unicode_from_state(token->rchar);
    +
    + len = string_len(token->sbuffer);
    +
    + DPRINT(Debug,22,(&Debug,
    + "get_new_tagfilter_token: u=%04u",
    + u));
    +
    + if (state_printable(token->rchar)) {
    + DPRINT(Debug,22,(&Debug," '%C'",
    + token->rchar));
    + }
    + DPRINT(Debug,22,(&Debug," tag_state=%d",
    + token->tag_state));
    + switch (token->tag_state) {
    + case ts_init: DPRINT(Debug,22,(&Debug," ts_init")); break;
    + case ts_tag_start: DPRINT(Debug,22,(&Debug," ts_tag_start")); break;
    + case ts_tag_bang: DPRINT(Debug,22,(&Debug," ts_tag_bang")); break;
    + case ts_tag_endmark: DPRINT(Debug,22,(&Debug," ts_tag_endmark")); break;
    + case ts_tag_params: DPRINT(Debug,22,(&Debug," ts_tag_params")); break;
    + case ts_tag_ending: DPRINT(Debug,22,(&Debug," ts_tag_ending")); break;
    + case ts_tag_bogus_comment: DPRINT(Debug,22,(&Debug," ts_tag_bogus_comment"));
    + break;
    + case ts_tag_self_closing: DPRINT(Debug,22,(&Debug," ts_tag_self_closing"));
    + break;
    + case ts_tag_comment_start: DPRINT(Debug,22,(&Debug," ts_tag_comment_start"));
    + break;
    + case ts_tag_comment: DPRINT(Debug,22,(&Debug," ts_tag_comment")); break;
    + case ts_tag_after_atrname: DPRINT(Debug,22,(&Debug," ts_tag_after_atrname"));
    + break;
    + case ts_tag_atrvalue: DPRINT(Debug,22,(&Debug," ts_tag_atrvalue"));
    + break;
    + case ts_tag_after_quoted: DPRINT(Debug,22,(&Debug," ts_tag_after_quoted"));
    + break;
    + case ts_tag_doctype_start: DPRINT(Debug,22,(&Debug," ts_tag_doctype_start"));
    + break;
    + case ts_tag_doctype_bogus: DPRINT(Debug,22,(&Debug," ts_tag_doctype_bogus"));
    + break;
    + }
    + DPRINT(Debug,22,(&Debug," entity_state=%d",token->entity_state));
    + switch (token->entity_state) {
    + case ent_init: DPRINT(Debug,22,(&Debug," ent_init")); break;
    + case ent_entity_start: DPRINT(Debug,22,(&Debug," ent_entity_start")); break;
    + case ent_decimal_ent: DPRINT(Debug,22,(&Debug," ent_decimal_ent")); break;
    + case ent_hexdec_ent: DPRINT(Debug,22,(&Debug," ent_hexdec_ent")); break;
    + }
    + DPRINT(Debug,22,(&Debug," tag_flags=%d",token->tag_flags));
    + if (ison(token->tag_flags, TFLAG_seen_nl)) {
    + DPRINT(Debug,22,(&Debug," TFLAG_seen_nl"));
    + }
    + if (ison(token->tag_flags, TFLAG_self_closing)) {
    + DPRINT(Debug,22,(&Debug," TFLAG_self_closing"));
    + }
    + if (ison(token->tag_flags, TFLAG_seen_equals)) {
    + DPRINT(Debug,22,(&Debug," TFLAG_seen_equals"));
    + }
    + if (ison(token->tag_flags, TFLAG_num_overflow)) {
    + DPRINT(Debug,22,(&Debug," TFLAG_num_overflow"));
    + }
    + if (ison(token->tag_flags, TFLAG_unhandled_class)) {
    + DPRINT(Debug,22,(&Debug," TFLAG_unhandled_class"));
    + }
    + DPRINT(Debug,22,(&Debug," tag_quote=%d\n",
    + token->tag_quote));
    +
    + if (0x000D /* CR '\r' */ == u) {
    + int peekch = state_getc(state_in);
    +
    + if ('\n' == peekch) { /* Got newline */
    + ch = peekch;
    + reset_state(token->rchar,0);
    + goto process_char;
    +
    + } else if (EOF != peekch)
    + state_ungetc(peekch,state_in);
    +
    + if (ison(tagfilter->tagflt_mode,TAGFLT_MODE_convert_cr))
    + u = 0x000A /* LF '\n' */;
    + }
    +
    + #define CHECK_SBUFFER_LEN(error_class) do { \
    + len = string_len(token->sbuffer); \
    + if (len >= MAX_TAG_TOKEN) { \
    + DPRINT(Debug,20,(&Debug, \
    + "get_new_tagfilter_token: Read %d characters\n", \
    + len)); \
    + ret = len; \
    + token->token_class = error_class; \
    + goto out; \
    + } \
    + } while(0)
    +
    + #define ADD_SBUFFER_CHAR(error_class) do { \
    + add_state_to_string(token->sbuffer,token->rchar); /* Returns no status */ \
    + reset_state(token->rchar,0); /* Get next character */ \
    + CHECK_SBUFFER_LEN(error_class); \
    + } while(0)
    +
    + #define ADD_SBUFFER_CHAR_NOCHECK do { \
    + add_state_to_string(token->sbuffer,token->rchar); /* Returns no status */ \
    + reset_state(token->rchar,0); /* Get next character */ \
    + len = string_len(token->sbuffer); \
    + } while (0)
    +
    + #define EMIT_TOKEN_CLASS(class) do { \
    + ret = len; \
    + token->token_class = class; \
    + goto out; \
    + } while (0)
    +
    + if (ison(token->tag_flags,TFLAG_seen_nl)) {
    +
    + /* Start newline span */
    +
    + if (0x000A /* LF '\n' */ == u) {
    +
    + ADD_SBUFFER_CHAR(tf_span_nl);
    +
    + } else {
    +
    + clearit(token->tag_flags, TFLAG_seen_nl);
    +
    + if (len > 0) {
    + /* got newline span */
    +
    + EMIT_TOKEN_CLASS(tf_span_nl);
    + }
    + }
    +
    +
    + } else if (ts_tag_doctype_bogus == token->tag_state) {
    + /* Bogus DOCTYPE line */
    +
    + if (token->tag_lookahead)
    + free_string(& (token->tag_lookahead));
    +
    + if (0x003E /* > */ == u) {
    +
    + token->tag_state = ts_init; /* Tag ended */
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_doctype_end);
    +
    + } else if ( 0x000A /* LF '\n' (new line) */ == u) {
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + EMIT_TOKEN_CLASS(tf_doctype_segment);
    +
    + } else if (0x0000 /* NUL */ == u) {
    + uint16 bad_char = UNICODE_BAD_CHAR;
    +
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Found NUL character when parsing doctype line\n"));
    +
    + reset_state(token->rchar,0); /* Get next character */
    + token->error = 1;
    + add_unicode_to_string(token->sbuffer,1,&bad_char);
    +
    + CHECK_SBUFFER_LEN(tf_doctype_segment);
    + } else {
    + ADD_SBUFFER_CHAR(tf_doctype_segment);
    + }
    +
    + } else if (ts_tag_doctype_start == token->tag_state) {
    +
    + int lookahead_len = 0;
    +
    + /* Seen <!DOCTYPE */
    +
    + if (token->tag_lookahead)
    + lookahead_len = string_len(token->tag_lookahead);
    +
    + /* Not correct .... */
    +
    + if (0x003E /* > */ == u) {
    +
    + if (token->tag_quote) {
    +
    + /* If > inside of quoted value, treate as error */
    +
    + if (token->tag_lookahead)
    + free_string(& (token->tag_lookahead));
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    + }
    +
    + if (token->tag_lookahead) {
    + token->doctype_item = token->tag_lookahead;
    + token->tag_lookahead = NULL;
    + EMIT_TOKEN_CLASS(tf_doctype_item);
    +
    + }
    +
    + token->tag_state = ts_init; /* Tag ended */
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_doctype_end);
    +
    + } else if (0x000A /* LF '\n' (new line) */ == u) {
    +
    +
    + if (token->tag_quote && token->tag_lookahead) {
    + add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
    + }
    +
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + if (token->tag_quote && token->tag_lookahead) {
    +
    + lookahead_len = string_len(token->tag_lookahead);
    + if (lookahead_len > MAX_DOCTYPE_VALUE) {
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + /* Forget it */
    +
    + free_string(& (token->tag_lookahead));
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    + }
    +
    + EMIT_TOKEN_CLASS(tf_doctype_segment);
    +
    + } else if (token->tag_lookahead) {
    +
    + token->doctype_item = token->tag_lookahead;
    + token->tag_lookahead = NULL;
    + EMIT_TOKEN_CLASS(tf_doctype_item);
    +
    + } else {
    + EMIT_TOKEN_CLASS(tf_doctype_space);
    + }
    +
    + } else if (0x0000 /* NUL */ == u) {
    + uint16 bad_char = UNICODE_BAD_CHAR;
    +
    + if (!token->tag_lookahead) {
    + len = string_len(token->sbuffer);
    + if (len > 0) {
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Found NUL character when parsing doctype line - flushing buffer before it\n"));
    + EMIT_TOKEN_CLASS(tf_doctype_space);
    + }
    + }
    +
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Found NUL character when parsing doctype line\n"));
    +
    + reset_state(token->rchar,0); /* Get next character */
    + token->error = 1;
    +
    + add_unicode_to_string(token->sbuffer,1,&bad_char);
    +
    + if (!token->tag_lookahead)
    + token->tag_lookahead = new_string(token->text_charset);
    +
    + add_unicode_to_string(token->tag_lookahead,1,&bad_char);
    + lookahead_len = string_len(token->tag_lookahead);
    + if (lookahead_len > MAX_DOCTYPE_VALUE) {
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + /* Forget it */
    +
    + free_string(& (token->tag_lookahead));
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    + }
    +
    + } else if (token->tag_quote && token->tag_lookahead) {
    +
    + /* Does not detect if there is no space after quote */
    +
    + if (token->tag_quote == u) {
    + token->tag_quote = 0;
    + add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + token->doctype_item = token->tag_lookahead;
    + token->tag_lookahead = NULL;
    + EMIT_TOKEN_CLASS(tf_doctype_item);
    + } else {
    +
    + add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
    + lookahead_len = string_len(token->tag_lookahead);
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + if (lookahead_len > MAX_DOCTYPE_VALUE) {
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + /* Forget it */
    +
    + free_string(& (token->tag_lookahead));
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    + }
    +
    + CHECK_SBUFFER_LEN(tf_doctype_segment);
    + }
    +
    + } else {
    + if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
    + 0x000C /* FF '\f' (form feed) */ == u ||
    + 0x0020 /* SPACE */ == u) {
    +
    + if (token->tag_lookahead) {
    + token->doctype_item = token->tag_lookahead;
    + token->tag_lookahead = NULL;
    + EMIT_TOKEN_CLASS(tf_doctype_item);
    +
    + }
    +
    + ADD_SBUFFER_CHAR(tf_doctype_space);
    +
    + } else if (0x0022 /* " */ == u ||
    + 0x0027 /* ' */ == u) {
    +
    + /* Not correct parsing because this does not check
    + was that part what was required to be quoted
    + */
    +
    + if (token->tag_lookahead) {
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + /* Forget it */
    +
    + free_string(& (token->tag_lookahead));
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    +
    + } else {
    +
    + len = string_len(token->sbuffer);
    + if (len > 0) {
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Flushing buffer before quoted item when parsing doctype line\n"));
    + EMIT_TOKEN_CLASS(tf_doctype_space);
    + }
    +
    + token->tag_lookahead = new_string(token->text_charset);
    +
    + add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
    + token->tag_quote = u;
    + }
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + } else {
    +
    + /* Enable buffer */
    +
    + if (!token->tag_lookahead) {
    + len = string_len(token->sbuffer);
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_doctype_space);
    + }
    +
    + token->tag_lookahead = new_string(token->text_charset);
    + }
    +
    + /* XXXXX -- correct missing? */
    +
    + add_state_to_string(token->tag_lookahead,token->rchar); /* Returns no status */
    + lookahead_len = string_len(token->tag_lookahead);
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + if (lookahead_len > MAX_DOCTYPE_VALUE) {
    +
    + token->tag_state = ts_tag_doctype_bogus;
    + /* Forget it */
    +
    + free_string(& (token->tag_lookahead));
    + EMIT_TOKEN_CLASS(tf_doctype_error);
    + }
    +
    + CHECK_SBUFFER_LEN(tf_doctype_segment);
    + }
    + }
    +
    + } else if (ts_tag_after_quoted == token->tag_state) {
    + /* After quoted value */
    +
    + if ( 0x000A /* LF '\n' (new line) */ == u) {
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + /* Parse new attribute instead */
    + token->tag_state = ts_tag_params;
    +
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
    + 0x000C /* FF '\f' (form feed) */ == u ||
    + 0x0020 /* SPACE */ == u) {
    +
    + /* Parse new attribute instead */
    + token->tag_state = ts_tag_params;
    +
    + } else if (0x003E /* > */ == u ||
    + 0x002F /* / */ == u) {
    +
    + token->tag_state = ts_tag_ending; /* Process for /> or > */
    +
    + } else {
    + token->tag_state = ts_tag_params;
    +
    + EMIT_TOKEN_CLASS(tf_tag_error);
    + }
    +
    + } else if (ent_hexdec_ent == token->entity_state) {
    +
    + uint16 v = 0;
    +
    + if (0x0030 /* 0 */ <= u && u <= 0x0039 /* 9 */)
    + v = u - 0x0030 /* 0 */;
    + else if (0x0041 /* A */ <= u && u <= 0x0046 /* F */)
    + v = u - 0x0041 /* A */ + 10;
    + else if (0x0061 /* a */ <= u && u <= 0x0066 /* f */)
    + v = u - 0x0061 /* a */ + 10;
    + else if (0x003B /* ; */ == u) {
    +
    + if (isoff(token->tag_flags,TFLAG_num_overflow)) {
    +
    + token->entity_state = ent_init;
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_numeric_entity);
    +
    + } else {
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + goto end_entity;
    + }
    + } else {
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + goto end_entity;
    + }
    +
    + /* Hexadecimal numeric character reference */
    +
    + if (isoff(token->tag_flags,TFLAG_num_overflow)) {
    +
    + if (token->numeric_reference > 0xFFFF / 16) {
    + setit(token->tag_flags,TFLAG_num_overflow);
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + } else {
    + token->numeric_reference *= 16;
    + token->numeric_reference += v;
    + }
    + }
    +
    + ADD_SBUFFER_CHAR(tf_entity_error);
    +
    + } else if (ent_decimal_ent == token->entity_state) {
    +
    + if (2 == len && ( 0x0078 /* x */ == u ||
    + 0x0058 /* X */ == u)) {
    +
    + /* Hexadecimal numeric character reference */
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + token->entity_state = ent_hexdec_ent;
    +
    + } else if (0x0030 /* 0 */ <= u && u <= 0x0039 /* 9 */) {
    +
    + /* Decimal numeric character reference */
    +
    + if (isoff(token->tag_flags,TFLAG_num_overflow)) {
    +
    + if (token->numeric_reference > 0xFFFF / 10) {
    + setit(token->tag_flags,TFLAG_num_overflow);
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + } else {
    + token->numeric_reference *= 10;
    + token->numeric_reference += u - 0x0030 /* 0 */;
    + }
    + }
    +
    + ADD_SBUFFER_CHAR(tf_entity_error);
    + } else if (0x003B /* ; */ == u) {
    +
    + if (isoff(token->tag_flags,TFLAG_num_overflow)) {
    +
    + token->entity_state = ent_init;
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_numeric_entity);
    +
    + } else {
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + goto end_entity;
    + }
    + } else {
    + token->numeric_reference = UNICODE_BAD_CHAR;
    + goto end_entity;
    + }
    +
    + } else if (ent_entity_start == token->entity_state) {
    +
    + /* & on body or on attribute value */
    +
    + if (1 == len && 0x0023 /* # */ == u) {
    +
    + /* Decimal numeric character reference */
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + token->entity_state = ent_decimal_ent;
    + token->numeric_reference = 0; /* Numeric reference */
    + clearit(token->tag_flags,TFLAG_num_overflow);
    +
    + if (token->named_reference) /* Named reference including & ; */
    + free_string(&(token->named_reference));
    +
    + if (token->walk_reference) /* Parsing of named reference */
    + free_name_entity_walk(&(token->walk_reference));
    +
    + } else if (tagfilter_entity_character(u,NULL)) {
    +
    + /* Named character references */
    +
    + /* Returns no status */
    + add_state_to_string(token->named_reference,token->rchar);
    +
    + ADD_SBUFFER_CHAR(tf_entity_error);
    +
    + if (token->walk_reference) {
    + /* free'es struct name_entity_walk, if no match */
    + advance_entity_walk(& (token->walk_reference), u);
    + }
    +
    + /* XXXXX --- not correct */
    +
    + } else {
    +
    + end_entity:
    + if (0x003B /* ; */ == u) {
    +
    + if (token->named_reference) { /* Named reference including & ; */
    + /* Returns no status */
    + add_state_to_string(token->named_reference,token->rchar);
    + }
    +
    + if (token->walk_reference) {
    + /* increment refcount */
    + token->match_reference =
    + tagfilter_match_reference(token->walk_reference,
    + tagfilter_match_semicolon);
    +
    + free_name_entity_walk(&(token->walk_reference));
    + }
    +
    + /* End of entity */
    + /* XXXX this accepts also no matching character references
    + even when there is shorter match
    + */
    +
    + token->entity_state = ent_init;
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_entity);
    + } else {
    +
    + if (0x000A /* LF '\n' */ == u) {
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + }
    +
    + if (token->named_reference) {
    + /* &xxx= on tags are not interpreted as character reference */
    +
    + if (ts_tag_atrvalue == token->tag_state &&
    + !token->atr_value_segment &&
    + 0x003D /* = */ == u) {
    +
    + if (token->walk_reference) /* Parsing of named reference */
    + free_name_entity_walk(&(token->walk_reference));
    +
    + token->atr_value_segment = token->named_reference;
    + token->named_reference = NULL;
    + token->entity_state = ent_init;
    +
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(&(token->named_reference));
    + }
    +
    + if (token->walk_reference) {
    + /* is this acceptable without ;
    +
    + XXXX this is parse error, but it is ignored
    + */
    +
    + /* increment refcount */
    + token->match_reference =
    + tagfilter_match_reference(token->walk_reference,
    + tagfilter_match_prefix);
    +
    +
    + free_name_entity_walk(&(token->walk_reference));
    +
    + if (token->match_reference) {
    + token->entity_state = ent_init;
    + EMIT_TOKEN_CLASS(tf_entity);
    + }
    + }
    +
    + /* XXXX Only handled on body */
    + if (token->named_reference && ts_init == token->tag_state) {
    + int reflen,i;
    +
    + reflen = string_len(token->named_reference);
    + /* Perhaps there was shorter match */
    + token->walk_reference = tagfilter_start_reference(tagfilter->have_entities);
    +
    + for (i = 0; i < reflen; i++) {
    + uint16 x = give_unicode_from_string(token->named_reference,i);
    +
    + /* Skip */
    + if (0x0026 /* & */ == x && 0 == i)
    + continue;
    +
    + /* increment refcount */
    + token->match_reference =
    + tagfilter_match_reference(token->walk_reference,
    + tagfilter_match_prefix);
    +
    + if (token->match_reference) {
    + /* XXXX this founds shortest prefix only */
    +
    + /* XXXX this is parse error, but it is ignored */
    +
    + struct string * temp = token->named_reference;
    + int POS = 0;
    +
    + token->named_reference = clip_from_string(temp,&POS,i);
    + token->resubmit = clip_from_string(temp,&POS,reflen);
    +
    + free_string(&temp);
    + free_name_entity_walk(&(token->walk_reference));
    +
    + token->entity_state = ent_init;
    + EMIT_TOKEN_CLASS(tf_entity);
    + }
    +
    + /* free'es struct name_entity_walk, if no match */
    + advance_entity_walk(& (token->walk_reference), x);
    +
    + if (! token->walk_reference)
    + break; /* Scan failed */
    + }
    +
    + if (token->walk_reference) /* Scan was not endded */
    + free_name_entity_walk(&(token->walk_reference));
    + }
    +
    + /* Parse error on entity -- do not include character */
    +
    + token->entity_state = ent_init;
    +
    + EMIT_TOKEN_CLASS(tf_entity_error);
    + }
    + }
    +
    +
    + } else if (ts_tag_atrvalue == token->tag_state) {
    +
    + int seglen = 0;
    +
    + if (!token->atr_value_segment) /* one part of arribute value */
    + token->atr_value_segment = new_string(token->text_charset);
    +
    + seglen = string_len(token->atr_value_segment);
    +
    + if (0x000A /* LF '\n' (new line) */ == u) {
    +
    + if (token->tag_quote) {
    +
    + add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
    + seglen = string_len(token->atr_value_segment);
    +
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    +
    + } else {
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(& token->atr_value_segment);
    +
    + /* Consume newline */
    +
    + if (token->atr_name) /* Reset name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + /* Parse new attribute instead */
    + token->tag_state = ts_tag_params;
    +
    + /* Value ends */
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
    + }
    +
    + } else if (! token->tag_quote &&
    + (0x0009 /* HT '\t' (horizontal tab) */ == u ||
    + 0x000C /* FF '\f' (form feed) */ == u ||
    + 0x0020 /* SPACE */ == u)) {
    +
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(& token->atr_value_segment);
    +
    + if (token->atr_name) /* Reset name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + /* Parse new attribute instead */
    + token->tag_state = ts_tag_params;
    +
    + /* Value ends */
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
    +
    + } else if (token->tag_quote &&
    + token->tag_quote == u) {
    +
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(& token->atr_value_segment);
    +
    + if (token->atr_name) /* Reset name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    + token->tag_quote = 0;
    + token->tag_state = ts_tag_after_quoted;
    +
    + /* Value ends */
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
    +
    + } else if (0x0026 /* & */ == u &&
    + tagfilter->have_entities) {
    +
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(& token->atr_value_segment);
    +
    +
    + token->entity_state = ent_entity_start;
    +
    + /* Start & .... */
    + token->named_reference = new_string(token->text_charset);
    + /* Returns no status */
    + add_state_to_string(token->named_reference,token->rchar);
    + token->walk_reference = tagfilter_start_reference(tagfilter->have_entities);
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + } else if (! token->tag_quote &&
    + 0x003E /* > */ == u) {
    +
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + free_string(& token->atr_value_segment);
    +
    + if (token->atr_name) /* Reset name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    +
    + /* Will emit tag end */
    + token->tag_state = ts_tag_ending;
    +
    + /* Value ends */
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_end);
    +
    +
    + } else if (0x0000 /* NUL */ == u) {
    + uint16 bad_char = UNICODE_BAD_CHAR;
    +
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Found NUL character when parsing attribute value\n"));
    +
    + reset_state(token->rchar,0); /* Get next character */
    +
    + token->error = 1;
    +
    + add_unicode_to_string(token->atr_value_segment,1,&bad_char);
    + seglen = string_len(token->atr_value_segment);
    +
    + add_unicode_to_string(token->sbuffer,1,&bad_char);
    +
    + if (seglen >= MAX_VALUE_SEGMENT) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    + CHECK_SBUFFER_LEN(tf_tag_atrvalue_error);
    +
    + } else if (! token->tag_quote &&
    + (0x0022 /* " */ == u ||
    + 0x0027 /* ' */ == u ||
    + 0x003C /* < */ == u ||
    + 0x003D /* = */ == u ||
    + 0x0060 /* ` */ == u)) {
    +
    + if (seglen > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    +
    + add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
    + ADD_SBUFFER_CHAR_NOCHECK;
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_error);
    +
    + } else {
    + add_state_to_string(token->atr_value_segment,token->rchar); /* Returns no status */
    + seglen = string_len(token->atr_value_segment);
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + if (seglen >= MAX_VALUE_SEGMENT) {
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_segment);
    + }
    + CHECK_SBUFFER_LEN(tf_tag_atrvalue_error);
    + }
    +
    + } else if (ts_tag_after_atrname == token->tag_state) {
    + /* Got attribute name, parse = */
    +
    + if (0x000A /* LF '\n' (new line) */ == u) {
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + EMIT_TOKEN_CLASS(tf_tag_space);
    +
    + } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
    + 0x000C /* FF '\f' (form feed) */ == u ||
    + 0x0020 /* SPACE */ == u) {
    +
    + ADD_SBUFFER_CHAR(tf_tag_space);
    +
    + } else if (isoff(token->tag_flags,TFLAG_seen_equals) &&
    + 0x002F /* / */ == u) {
    +
    + /* / do not end when parsing unquoted atribute value? */
    +
    + token->tag_state = ts_tag_ending; /* Process for /> or > */
    +
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    +
    + } else if (0x003E /* > */ == u) {
    +
    + /* If = seen this is error */
    +
    + token->tag_state = ts_tag_ending; /* Process for /> or > */
    +
    + if (ison(token->tag_flags,TFLAG_seen_equals)) {
    + EMIT_TOKEN_CLASS(tf_tag_error);
    + } else if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    + } else if (isoff(token->tag_flags,TFLAG_seen_equals) &&
    + 0x003D /* = */ == u) {
    +
    + /* Emit first spaces */
    +
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    + setit(token->tag_flags,TFLAG_seen_equals);
    + EMIT_TOKEN_CLASS(tf_tag_atrequal);
    +
    + } else if (ison(token->tag_flags,TFLAG_seen_equals) &&
    + (0x0022 /* " */ == u ||
    + 0x0027 /* ' */ == u)) {
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    +
    + clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    + token->tag_quote = u;
    + token->tag_state = ts_tag_atrvalue;
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_start);
    +
    + } else {
    +
    + if (ison(token->tag_flags,TFLAG_seen_equals)) {
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    +
    + clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    + token->tag_quote = 0;
    + token->tag_state = ts_tag_atrvalue;
    + EMIT_TOKEN_CLASS(tf_tag_atrvalue_start);
    + } else {
    + /* Parse new attribute instead */
    +
    + if (token->atr_name) /* Reset name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    + clearit(token->tag_flags,TFLAG_seen_equals); /* Forget = */
    +
    + token->tag_state = ts_tag_params;
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_space);
    + }
    + }
    + }
    +
    + } else if (ts_tag_ending == token->tag_state) {
    + /* Process for /> or > */
    +
    + if (0x002F /* / */ == u) {
    +
    + if (isoff(token->tag_flags,TFLAG_self_closing)) {
    + setit(token->tag_flags,TFLAG_self_closing);
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    + } else {
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_tag_param_error);
    + }
    + } else if (0x003E /* > */ == u) {
    + token->tag_state = ts_init; /* Tag ended */
    +
    + if (token->tag_name) /* Clear name of start or end tag */
    + free_string(&(token->tag_name));
    + if (token->atr_name) /* Clear name of attribute on tag */
    + free_string(&(token->atr_name));
    +
    +
    + if (ison(token->tag_flags,TFLAG_self_closing)) {
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + clearit(token->tag_flags,TFLAG_self_closing); /* Passed already */
    + EMIT_TOKEN_CLASS(tf_tag_selfclosed_end);
    + } else {
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + EMIT_TOKEN_CLASS(tf_tag_end);
    + }
    +
    + } else {
    + /* Possible / without > */
    +
    + token->tag_state = ts_tag_params;
    +
    + EMIT_TOKEN_CLASS(tf_tag_param_error);
    + }
    +
    + } else if (ts_tag_params == token->tag_state) {
    + /* Seen space after < or expecting params */
    +
    + if (token->atr_name) { /* Name of attribute on tag */
    +
    + if (0x000A /* LF '\n' (new line) */ == u) {
    + /* Consume newline */
    +
    + token->have_nl = 1;
    + reset_state(token->rchar,0);
    +
    + token->tag_state = ts_tag_after_atrname; /* Got attribute name, parse = */
    + EMIT_TOKEN_CLASS(tf_tag_atrname);
    +
    + } else if (0x002F /* / */ == u ||
    + 0x003E /* > */ == u) {
    +
    +
    + token->tag_state = ts_tag_ending; /* Process for /> or > */
    +
    + if (len > 0) {
    + EMIT_TOKEN_CLASS(tf_tag_atrname);
    + }
    +
    + } else if (0x0009 /* HT '\t' (horizontal tab) */ == u ||
    + 0x000C /* FF '\f' (form feed) */ == u ||
    + 0x0020 /* SPACE */ == u ||
    + 0x003D /* = */ == u) {
    +
    + token->tag_state = ts_tag_after_atrname; /* Got attribute name, parse = */
    + EMIT_TOKEN_CLASS(tf_tag_atrname);
    +
    + } else if (0x0022 /* " */ == u ||
    + 0x0027 /* ' */ == u ||
    + 0x003C /* < */ == u) {
    +
    + /* Report as error, but collect to attribure name */
    +
    + /* Returns no status */
    + add_state_to_string(token->atr_name,token->rchar);
    +
    + ADD_SBUFFER_CHAR_NOCHECK;
    +
    + /* Same time this is displayed? as error */
    +
    + EMIT_TOKEN_CLASS(tf_tag_param_error);
    +
    + } else if (0x0000 /* NUL */ == u) {
    + uint16 bad_char = UNICODE_BAD_CHAR;
    +
    + DPRINT(Debug,20,(&Debug,
    + "get_new_tagfilter_token: Found NUL character when parsing attribute name\n"));
    +
    + reset_state(token->rchar,0); /* Get next character */
    +
    + token->error = 1;
    +
    + add_unicode_to_string(token->atr_name,1,&bad_char);
    +
    + add_unicode_to_string(token->sbuffer,1,&bad_char);

    [continued in next message]

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)