• Can anyone improve this ?

    From Bonita Montero@21:1/5 to All on Tue Jan 11 09:08:24 2022
    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc = encodes[(size_t)countl_zero<unsigned
    char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Juha Nieminen@21:1/5 to Bonita Montero on Tue Jan 11 09:50:34 2022
    Bonita Montero <Bonita.Montero@gmail.com> wrote:
    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc = encodes[(size_t)countl_zero<unsigned
    char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index, ++length)
    if((str[index] & 0xC0) == 0xC0)
    while((str[index+1] & 0xC0) == 0x80)
    ++index;
    return length;
    }

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Juha Nieminen on Tue Jan 11 11:08:24 2022
    Juha Nieminen <nospam@thanks.invalid> writes:

    Bonita Montero <Bonita.Montero@gmail.com> wrote:
    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc = encodes[(size_t)countl_zero<unsigned
    char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index, ++length)
    if((str[index] & 0xC0) == 0xC0)
    while((str[index+1] & 0xC0) == 0x80)
    ++index;
    return length;
    }

    Or the even more straightforward:

    size_t ustrlen(char *s)
    {
    size_t len = 0;
    while (*s) len += (*s++ & 0xc0) != 0x80;
    return len;
    }

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Juha Nieminen@21:1/5 to Juha Nieminen on Tue Jan 11 11:07:33 2022
    Juha Nieminen <nospam@thanks.invalid> wrote:
    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index, ++length)
    if((str[index] & 0xC0) == 0xC0)
    while((str[index+1] & 0xC0) == 0x80)
    ++index;
    return length;
    }

    Actually there's an even simpler way:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index)
    length += (str[index] & 0xC0) != 0x80;
    return length;
    }

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Christian Gollwitzer@21:1/5 to All on Tue Jan 11 18:41:45 2022
    Am 11.01.22 um 18:28 schrieb Bonita Montero:
    Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
         std::size_t length = 0;
         for(std::size_t index = 0; str[index]; ++index, ++length)
             if((str[index] & 0xC0) == 0xC0)
                 while((str[index+1] & 0xC0) == 0x80)
                     ++index;
         return length;
    }

    The issue with that solution is that it doesn't correctly detect
    any kind of mis-formatted UTF-8-string. I get the number of chars
    preceding a header-char from the table and check if there are an
    according number of 0x80-headered chars.


    I wrote this code once to check, if a string is valid UTF-8 - actually,
    I used this platform to test-drive it: https://leetcode.com/problems/utf-8-validation/ and it came off as the
    fastest solution handed in so far:

    ==================================================================
    enum utf8token { utf8lowbyte = 1, utf8doublet = 2, utf8triplet = 3, utf8quadruplet = 4, utf8highbyte, utf8fail };

    static utf8token utf8classify(unsigned char data) {
    if ((data & 0x80) == 0) { return utf8lowbyte; }
    if ((data & 0xC0) == 0x80) { return utf8highbyte;}
    if ((data & 0xE0) == 0xC0) { return utf8doublet; }
    if ((data & 0xF0) == 0xE0) { return utf8triplet; }
    if ((data & 0xF8) == 0xF0) { return utf8quadruplet; }
    return utf8fail;
    }

    static bool valid_utf8(const char* data, std::size_t dataSize) {
    for (std::size_t i = 0; i < dataSize; i++) {
    int codelength = utf8classify(static_cast<unsigned char>(data[i]));
    if (codelength == utf8highbyte || codelength == utf8fail)
    return false;

    for (int j = 1; j<codelength; j++) {
    // check for premature end of input
    i++;
    if (i >= dataSize) return false;

    if (utf8classify(static_cast<unsigned char>(data[i])) != utf8highbyte)
    return false;
    }
    }

    return true;
    }
    ==========================================================

    It only returns true or false according to the problem definition, but I
    think you can easily rework it to count the number of characters. I
    believe all you have to do is increase a counter for every turn of the
    outer loop.

    Christian

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Tue Jan 11 18:28:58 2022
    Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
    Bonita Montero <Bonita.Montero@gmail.com> wrote:
    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc = encodes[(size_t)countl_zero<unsigned
    char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index, ++length)
    if((str[index] & 0xC0) == 0xC0)
    while((str[index+1] & 0xC0) == 0x80)
    ++index;
    return length;
    }

    The issue with that solution is that it doesn't correctly detect
    any kind of mis-formatted UTF-8-string. I get the number of chars
    preceding a header-char from the table and check if there are an
    according number of 0x80-headered chars.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Tue Jan 11 19:36:46 2022
    Am 11.01.2022 um 18:41 schrieb Christian Gollwitzer:
    Am 11.01.22 um 18:28 schrieb Bonita Montero:
    Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
         std::size_t length = 0;
         for(std::size_t index = 0; str[index]; ++index, ++length)
             if((str[index] & 0xC0) == 0xC0)
                 while((str[index+1] & 0xC0) == 0x80)
                     ++index;
         return length;
    }

    The issue with that solution is that it doesn't correctly detect
    any kind of mis-formatted UTF-8-string. I get the number of chars
    preceding a header-char from the table and check if there are an
    according number of 0x80-headered chars.


    I wrote this code once to check, if a string is valid UTF-8 - actually,
    I used this platform to test-drive it: https://leetcode.com/problems/utf-8-validation/ and it came off as the fastest solution handed in so far:

    ==================================================================
    enum utf8token { utf8lowbyte = 1, utf8doublet = 2, utf8triplet = 3, utf8quadruplet = 4, utf8highbyte, utf8fail };

    static utf8token utf8classify(unsigned char data) {
        if ((data & 0x80) == 0) { return utf8lowbyte; }
        if ((data & 0xC0) == 0x80) { return utf8highbyte;}
        if ((data & 0xE0) == 0xC0) { return utf8doublet; }
        if ((data & 0xF0) == 0xE0) { return utf8triplet; }
        if ((data & 0xF8) == 0xF0) { return utf8quadruplet; }
        return utf8fail;
    }

    Sorry, that makes a lot of unprecitible branches.
    That's while I used a table.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Tue Jan 11 21:51:17 2022
    I think that's even simpler:

    size_t utf8Strlen( char const *str )
    {
    static bool const isHeader[8] = { true, false, true, true, true, false, false, false };
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t headerClass = countl_zero<unsigned char>( ~c );
    if( !isHeader[headerClass] ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + sizes[headerClass]; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From =?UTF-8?B?w5bDtiBUaWli?=@21:1/5 to Bonita Montero on Tue Jan 11 21:54:55 2022
    On Tuesday, 11 January 2022 at 19:29:19 UTC+2, Bonita Montero wrote:
    Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
    Bonita Montero <Bonita....@gmail.com> wrote:
    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc = encodes[(size_t)countl_zero<unsigned
    char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    How does it compare to the more straightforward:

    std::size_t utf8Strlen(const char *str)
    {
    std::size_t length = 0;
    for(std::size_t index = 0; str[index]; ++index, ++length)
    if((str[index] & 0xC0) == 0xC0)
    while((str[index+1] & 0xC0) == 0x80)
    ++index;
    return length;
    }
    The issue with that solution is that it doesn't correctly detect
    any kind of mis-formatted UTF-8-string. I get the number of chars
    preceding a header-char from the table and check if there are an
    according number of 0x80-headered chars.

    Yes, your code indeed detects some possible errors in encoding,
    but it does not detect overlong encodings and sequences that
    decode to an invalid code point. So both are only safe to use with
    guaranteed clean data and with it the outcome is same.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Wed Jan 12 14:20:23 2022
    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Wed Jan 12 14:19:14 2022
    The ultimately simple solution:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !sizes ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Bonita Montero on Wed Jan 12 14:14:23 2022
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error you
    have decided to look for. There are other encoding errors that this
    code won't catch. Seems a bit arbitrary to me. I'd want a fast
    utf8-strlen for external strings that have been validated and for
    strings internally generated by valid code, or a slower one that caught
    all invalid encodings.

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From jameskuyper@alumni.caltech.edu@21:1/5 to Bonita Montero on Wed Jan 12 08:38:44 2022
    On Wednesday, January 12, 2022 at 11:20:55 AM UTC-5, Bonita Montero wrote:
    Am 12.01.2022 um 15:14 schrieb Ben Bacarisse:
    ...
    There are other encoding errors that this code won't catch.
    Not at the UTF-8 level.

    "... it does not detect overlong encodings and sequences that decode to an invalid code point." To what level do you assign those errors, if not at the UTF-8 level? It's the specification of UTF-8 itself which identifies those as errors.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Wed Jan 12 17:20:41 2022
    Am 12.01.2022 um 15:14 schrieb Ben Bacarisse:
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error you
    have decided to look for. ...

    I think that's ok

    There are other encoding errors that this code won't catch.

    Not at the UTF-8 level.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Bonita Montero on Wed Jan 12 16:55:04 2022
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    Am 12.01.2022 um 15:14 schrieb Ben Bacarisse:
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }
    You reject simpler solutions because they don't detect the one error you
    have decided to look for. ...

    I think that's ok

    Obviously.

    There are other encoding errors that this code won't catch.

    Not at the UTF-8 level.

    Not so.

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Ben Bacarisse on Sun Jan 16 12:12:45 2022
    Ben Bacarisse <ben.usenet@bsb.me.uk> writes:

    Bonita Montero <Bonita.Montero@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error
    you have decided to look for. There are other encoding errors that
    this code won't catch. Seems a bit arbitrary to me. [...]

    (Sorry, accidental send)

    I wouldn't say arbitrary. The function verifies that its input is syntactically well-formed while ignoring the question of whether it
    is semantically well-formed. That choice may not be ideal but I
    don't think it's totally unreasonable.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Ben Bacarisse on Sun Jan 16 12:09:13 2022
    Ben Bacarisse <ben.usenet@bsb.me.uk> writes:

    Bonita Montero <Bonita.Montero@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error
    you have decided to look for. There are other encoding errors that
    this code won't catch. Seems a bit arbitrary to me. [...]

    I wouldn't say arbitrary. The function verifies that its input is syntactically well-formed while ignoring the question of whether it
    is semantically well-formed. That choice may not be an ideal choice but I don't think it is an unreasonable one.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From =?UTF-8?B?w5bDtiBUaWli?=@21:1/5 to Tim Rentsch on Sun Jan 16 13:00:15 2022
    On Sunday, 16 January 2022 at 22:13:01 UTC+2, Tim Rentsch wrote:
    Ben Bacarisse <ben.u...@bsb.me.uk> writes:

    Bonita Montero <Bonita....@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error
    you have decided to look for. There are other encoding errors that
    this code won't catch. Seems a bit arbitrary to me. [...]

    (Sorry, accidental send)
    I wouldn't say arbitrary. The function verifies that its input is syntactically well-formed while ignoring the question of whether it
    is semantically well-formed. That choice may not be ideal but I
    don't think it's totally unreasonable.

    With complicated syntax and semantics it makes sense
    as the issues are easier to reason about in separation and the
    expectations of users to diagnostics are also quite high.

    UTF-8 is usually produced by software. It takes not too lot of
    code to detect all issues in it. Usually the symbol � is shown
    as "diagnostic" for any issue in Unicode. Articles like that <https://arxiv.org/pdf/2010.03090.pdf> claim that whole
    UTF-8 validation is doable with less than one instruction
    per byte validated.

    So if some problem involving UTF-8 needs only half of error
    checking then it is fair to expect it been said in problem
    description.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Tiib on Mon Jan 17 10:13:28 2022
    Tiib <ootiib@hot.ee> writes:

    On Sunday, 16 January 2022 at 22:13:01 UTC+2, Tim Rentsch wrote:

    Ben Bacarisse <ben.u...@bsb.me.uk> writes:

    Bonita Montero <Bonita....@gmail.com> writes:

    sizes instead of size; corrected:

    size_t utf8Strlen( char const *str )
    {
    static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    size_t size = sizes[countl_zero<unsigned char>( ~c )];
    if( !size ) [[unlikely]]
    return -1;
    ++len;
    for( char const *cpEnd = str + size; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    You reject simpler solutions because they don't detect the one error
    you have decided to look for. There are other encoding errors that
    this code won't catch. Seems a bit arbitrary to me. [...]

    (Sorry, accidental send)
    I wouldn't say arbitrary. The function verifies that its input is
    syntactically well-formed while ignoring the question of whether it
    is semantically well-formed. That choice may not be ideal but I
    don't think it's totally unreasonable.

    With complicated syntax and semantics it makes sense
    as the issues are easier to reason about in separation and the
    expectations of users to diagnostics are also quite high.

    UTF-8 is usually produced by software. It takes not too lot of
    code to detect all issues in it. Usually the symbol ? is shown
    as "diagnostic" for any issue in Unicode. [...]

    So if some problem involving UTF-8 needs only half of error
    checking then it is fair to expect it been said in problem
    description.

    Yes, I know you have no shortage of strong opinions.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Bonita Montero on Mon Jan 17 10:28:48 2022
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc =
    encodes[(size_t)countl_zero<unsigned char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    Just for fun -

    size_t
    utf8_units_two( const char *s ){
    size_t r = -1;
    unsigned char c;

    next:
    switch( r++, c = *s++, c >> 3 ){
    cases(16,17,18,19,20,21,22,23,31): return -1;

    cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;

    cases(0): if( c != 0 ) goto next;
    }

    return r;
    }


    (Note: the cases() macro produces one 'case X:' for each
    argument X except the last one where the ':' is omitted,
    written using the standard C preprocessor, and left as an
    exercise for any ambitious readers.)

    Incidentally, this code provides the only example I remember
    where using 'goto' is pretty much unavoidable, in that any
    re-writing without 'goto' seems awkward or inferior in some
    other way. I guess I should say, at least not that I could
    find, maybe someone else can do better.

    This code also has the interesting property that along the main
    code path there are no conditional branches (as compiled by gcc
    under -O2).

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Alf P. Steinbach@21:1/5 to Tim Rentsch on Mon Jan 17 21:51:37 2022
    On 17 Jan 2022 19:28, Tim Rentsch wrote:
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc =
    encodes[(size_t)countl_zero<unsigned char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    Just for fun -

    size_t
    utf8_units_two( const char *s ){
    size_t r = -1;
    unsigned char c;

    next:
    switch( r++, c = *s++, c >> 3 ){
    cases(16,17,18,19,20,21,22,23,31): return -1;

    cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;

    cases(0): if( c != 0 ) goto next;
    }

    return r;
    }

    That's very ugly.


    (Note: the cases() macro produces one 'case X:' for each
    argument X except the last one where the ':' is omitted,
    written using the standard C preprocessor, and left as an
    exercise for any ambitious readers.)

    Exercise for you: test that unrevealed macro with Visual C++.

    (Making it work also with Visual C++ is doable.)


    Incidentally, this code provides the only example I remember
    where using 'goto' is pretty much unavoidable, in that any
    re-writing without 'goto' seems awkward or inferior in some
    other way. I guess I should say, at least not that I could
    find, maybe someone else can do better.

    The `for(;;)`, `continue;` and `break;` constructs come to mind, so
    you're right, someone else can do better.

    At a guess the code counts the number of Unicode code points?

    Does it return -1 also for "too long" UTF-8 sequence?


    This code also has the interesting property that along the main
    code path there are no conditional branches (as compiled by gcc
    under -O2).

    Smart compiler removed all the `if`'s?

    I guess you're flame-baiting a little just for fun, but I chose to take
    it seriously.

    - Alf

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Alf P. Steinbach on Mon Jan 17 16:16:18 2022
    "Alf P. Steinbach" <alf.p.steinbach@gmail.com> writes:

    On 17 Jan 2022 19:28, Tim Rentsch wrote:

    Bonita Montero <Bonita.Montero@gmail.com> writes:

    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc =
    encodes[(size_t)countl_zero<unsigned char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    Just for fun -

    size_t
    utf8_units_two( const char *s ){
    size_t r = -1;
    unsigned char c;

    next:
    switch( r++, c = *s++, c >> 3 ){
    cases(16,17,18,19,20,21,22,23,31): return -1;

    cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;

    cases(0): if( c != 0 ) goto next;
    }

    return r;
    }

    That's very ugly.

    Beauty is in the eye of the beholder. Personally I think it's
    rather pretty.

    (Note: the cases() macro produces one 'case X:' for each
    argument X except the last one where the ':' is omitted,
    written using the standard C preprocessor, and left as an
    exercise for any ambitious readers.)

    Exercise for you: test that unrevealed macro with Visual C++.

    I don't have access to a Visual C++ system. The macro I wrote
    compiles under gcc and g++ as C99, C11, C++11, C++14, and C++17,
    using a -pedantic-errors option. (Using clang and clang++ gives
    the same results.)

    (Making it work also with Visual C++ is doable.)

    Does this need more than just writing the definition in
    conforming C11 or C++11?

    Incidentally, this code provides the only example I remember
    where using 'goto' is pretty much unavoidable, in that any
    re-writing without 'goto' seems awkward or inferior in some
    other way. I guess I should say, at least not that I could
    find, maybe someone else can do better.

    The `for(;;)`, `continue;` and `break;` constructs come to mind, so
    you're right, someone else can do better.

    It is of course possible to write the function without using any
    'goto's. Whether such a writing is an improvement is another
    matter. To my eye the goto version expresses the structure more
    directly and in a form that is easier to comprehend. I did try
    actually writing a version with a continue-able loop around the
    switch(), but it looked clumsy and not as easy to follow as the
    goto version. If you have an example to post I definitely would
    like to see it.

    At a guess the code counts the number of Unicode code points?

    Yes, sorry, I thought that was apparent from the earlier
    context.

    Does it return -1 also for "too long" UTF-8 sequence?

    The -1 return values is for character sequences that are not
    syntactically well-formed for UTF-8.

    This code also has the interesting property that along the main
    code path there are no conditional branches (as compiled by gcc
    under -O2).

    Smart compiler removed all the `if`'s?

    I guess you're flame-baiting a little just for fun, but I chose to
    take it seriously.

    I'm not flame-baiting at all. By "main code path" I mean that
    part of the code that deals with normal ASCII characters (and not
    counting the terminating null character). The switch() statement
    generates an unconditional indirect branch, indexing a table of
    32 targets, and the 15 targets for normal ASCII characters simply
    go around the loop again, with no tests. I expect you can see
    that based on the line with 15 cases, which just does a 'goto'
    to start the loop again.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Tim Rentsch@21:1/5 to Alf P. Steinbach on Mon Jan 17 22:43:04 2022
    "Alf P. Steinbach" <alf.p.steinbach@gmail.com> writes:

    On 17 Jan 2022 19:28, Tim Rentsch wrote:

    (Note: the cases() macro produces one 'case X:' for each
    argument X except the last one where the ':' is omitted,
    written using the standard C preprocessor, and left as an
    exercise for any ambitious readers.)

    Exercise for you: test that unrevealed macro with Visual C++.

    Update after my previous message. The cases() macro I wrote has
    now been tested with Visual C++, and it compiles there without
    complaint.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Bonita Montero@21:1/5 to All on Tue Jan 18 09:18:35 2022
    Am 17.01.2022 um 19:28 schrieb Tim Rentsch:
    Bonita Montero <Bonita.Montero@gmail.com> writes:

    I had an idea how to write a fast UTF-8 strlen, here it is:

    size_t utf8Strlen( char const *str )
    {
    struct encode_t { size_t lenIncr, strIncr; };
    static encode_t const encodes[] =
    {
    { 1, 1 },
    { 0, 0 },
    { 1, 2 },
    { 1, 3 },
    { 1, 4 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 },
    { 0, 0 }
    };
    size_t len = 0;
    for( unsigned char c; (c = *str); )
    {
    encode_t const &enc =
    encodes[(size_t)countl_zero<unsigned char>( ~c )];
    if( !enc.lenIncr ) [[unlikely]]
    return -1;
    len += enc.lenIncr;
    for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
    if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
    return -1;
    }
    return len;
    }

    Has anyone further ideas to improve this ?

    Just for fun -

    size_t
    utf8_units_two( const char *s ){
    size_t r = -1;
    unsigned char c;

    next:
    switch( r++, c = *s++, c >> 3 ){
    cases(16,17,18,19,20,21,22,23,31): return -1;

    cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
    cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;

    cases(0): if( c != 0 ) goto next;
    }

    return r;
    }

    That's not C or C++ and it does have a lot of branch-mispredictions.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Alf P. Steinbach@21:1/5 to Tim Rentsch on Tue Jan 18 12:09:01 2022
    On 18 Jan 2022 07:43, Tim Rentsch wrote:
    "Alf P. Steinbach" <alf.p.steinbach@gmail.com> writes:

    On 17 Jan 2022 19:28, Tim Rentsch wrote:

    (Note: the cases() macro produces one 'case X:' for each
    argument X except the last one where the ':' is omitted,
    written using the standard C preprocessor, and left as an
    exercise for any ambitious readers.)

    Exercise for you: test that unrevealed macro with Visual C++.

    Update after my previous message. The cases() macro I wrote has
    now been tested with Visual C++, and it compiles there without
    complaint.

    Good to hear. Visual C++ has or had somewhat non-standard treatment of
    token pasting and variadic macros, and either it's been fixed, or your
    code avoided running into it. A comment in some old code of mine says that

    ❞ [Visual C++ 2017] is unable to count `__VA_ARGS__` as *n* arguments,
    and instead counts it as 1. Mostly.

    Counting the number of arguments of a variadic macro was AFAIK invented
    by Laurent Deniau in 2006, and it can go like this:


    #define N_ARGUMENTS( ... ) \
    INVOKE_MACRO( \
    ARGUMENT_64, \
    ( \
    __VA_ARGS__, \
    63, 62, 61, 60, \
    59, 58, 57, 56, 55, 54, 53, 52, 51, 50, \
    49, 48, 47, 46, 45, 44, 43, 42, 41, 40, \
    39, 38, 37, 36, 35, 34, 33, 32, 31, 30, \
    29, 28, 27, 26, 25, 24, 23, 22, 21, 20, \
    19, 18, 17, 16, 15, 14, 13, 12, 11, 10, \
    9, 8, 7, 6, 5, 4, 3, 2, 1, 0 \
    ) \
    )

    #define ARGUMENT_64( \
    a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, \
    a11, a12, a13, a14, a15, a16, a17, a18, a19, a20, \
    a21, a22, a23, a24, a25, a26, a27, a28, a29, a30, \
    a31, a32, a33, a34, a35, a36, a37, a38, a39, a40, \
    a41, a42, a43, a44, a45, a46, a47, a48, a49, a50, \
    a51, a52, a53, a54, a55, a56, a57, a58, a59, a60, \
    a61, a62, a63, a64, ... ) \
    a64


    ... where `INVOKE_MACRO` tackles the Visual C++ quirks:


    /// \brief Invokes the specified macro with the specified arguments list.
    /// \hideinitializer
    /// \param m The name of a macro to invoke.
    /// \param arglist A parenthesized list of arguments. Can be empty.
    ///
    /// The only difference between `INVOKE_MACRO` and `INVOKE_MACRO_B` is
    that they're
    /// *different* macros. One may have to use both in order to guarantee
    macro expansion in
    /// certain (not very well defined) situations.

    #define INVOKE_MACRO( m, arglist ) \
    m arglist

    #define INVOKE_MACRO_B( m, arglist ) \
    m arglist


    I've removed macro name prefixes in the above. For reusable code there
    better be name prefixes, to reduce or avoid name collisions.

    - Alf

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)