I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc = encodes[(size_t)countl_zero<unsigned
char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
Bonita Montero <Bonita.Montero@gmail.com> wrote:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc = encodes[(size_t)countl_zero<unsigned
char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
The issue with that solution is that it doesn't correctly detect
any kind of mis-formatted UTF-8-string. I get the number of chars
preceding a header-char from the table and check if there are an
according number of 0x80-headered chars.
Bonita Montero <Bonita.Montero@gmail.com> wrote:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc = encodes[(size_t)countl_zero<unsigned
char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
Am 11.01.22 um 18:28 schrieb Bonita Montero:
Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
The issue with that solution is that it doesn't correctly detect
any kind of mis-formatted UTF-8-string. I get the number of chars
preceding a header-char from the table and check if there are an
according number of 0x80-headered chars.
I wrote this code once to check, if a string is valid UTF-8 - actually,
I used this platform to test-drive it: https://leetcode.com/problems/utf-8-validation/ and it came off as the fastest solution handed in so far:
==================================================================
enum utf8token { utf8lowbyte = 1, utf8doublet = 2, utf8triplet = 3, utf8quadruplet = 4, utf8highbyte, utf8fail };
static utf8token utf8classify(unsigned char data) {
if ((data & 0x80) == 0) { return utf8lowbyte; }
if ((data & 0xC0) == 0x80) { return utf8highbyte;}
if ((data & 0xE0) == 0xC0) { return utf8doublet; }
if ((data & 0xF0) == 0xE0) { return utf8triplet; }
if ((data & 0xF8) == 0xF0) { return utf8quadruplet; }
return utf8fail;
}
Am 11.01.2022 um 10:50 schrieb Juha Nieminen:
Bonita Montero <Bonita....@gmail.com> wrote:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc = encodes[(size_t)countl_zero<unsigned
char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
How does it compare to the more straightforward:
std::size_t utf8Strlen(const char *str)The issue with that solution is that it doesn't correctly detect
{
std::size_t length = 0;
for(std::size_t index = 0; str[index]; ++index, ++length)
if((str[index] & 0xC0) == 0xC0)
while((str[index+1] & 0xC0) == 0x80)
++index;
return length;
}
any kind of mis-formatted UTF-8-string. I get the number of chars
preceding a header-char from the table and check if there are an
according number of 0x80-headered chars.
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Am 12.01.2022 um 15:14 schrieb Ben Bacarisse:...
There are other encoding errors that this code won't catch.Not at the UTF-8 level.
Bonita Montero <Bonita.Montero@gmail.com> writes:
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
You reject simpler solutions because they don't detect the one error you
have decided to look for. ...
There are other encoding errors that this code won't catch.
Am 12.01.2022 um 15:14 schrieb Ben Bacarisse:
Bonita Montero <Bonita.Montero@gmail.com> writes:
sizes instead of size; corrected:You reject simpler solutions because they don't detect the one error you
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
have decided to look for. ...
I think that's ok
There are other encoding errors that this code won't catch.
Not at the UTF-8 level.
Bonita Montero <Bonita.Montero@gmail.com> writes:
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
You reject simpler solutions because they don't detect the one error
you have decided to look for. There are other encoding errors that
this code won't catch. Seems a bit arbitrary to me. [...]
Bonita Montero <Bonita.Montero@gmail.com> writes:
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
You reject simpler solutions because they don't detect the one error
you have decided to look for. There are other encoding errors that
this code won't catch. Seems a bit arbitrary to me. [...]
Ben Bacarisse <ben.u...@bsb.me.uk> writes:
Bonita Montero <Bonita....@gmail.com> writes:
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
You reject simpler solutions because they don't detect the one error
you have decided to look for. There are other encoding errors that
this code won't catch. Seems a bit arbitrary to me. [...]
(Sorry, accidental send)
I wouldn't say arbitrary. The function verifies that its input is syntactically well-formed while ignoring the question of whether it
is semantically well-formed. That choice may not be ideal but I
don't think it's totally unreasonable.
On Sunday, 16 January 2022 at 22:13:01 UTC+2, Tim Rentsch wrote:
Ben Bacarisse <ben.u...@bsb.me.uk> writes:
Bonita Montero <Bonita....@gmail.com> writes:
sizes instead of size; corrected:
size_t utf8Strlen( char const *str )
{
static size_t const sizes[8] = { 1, 0, 2, 3, 4, 0, 0, 0 };
size_t len = 0;
for( unsigned char c; (c = *str); )
{
size_t size = sizes[countl_zero<unsigned char>( ~c )];
if( !size ) [[unlikely]]
return -1;
++len;
for( char const *cpEnd = str + size; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
You reject simpler solutions because they don't detect the one error
you have decided to look for. There are other encoding errors that
this code won't catch. Seems a bit arbitrary to me. [...]
(Sorry, accidental send)
I wouldn't say arbitrary. The function verifies that its input is
syntactically well-formed while ignoring the question of whether it
is semantically well-formed. That choice may not be ideal but I
don't think it's totally unreasonable.
With complicated syntax and semantics it makes sense
as the issues are easier to reason about in separation and the
expectations of users to diagnostics are also quite high.
UTF-8 is usually produced by software. It takes not too lot of
code to detect all issues in it. Usually the symbol ? is shown
as "diagnostic" for any issue in Unicode. [...]
So if some problem involving UTF-8 needs only half of error
checking then it is fair to expect it been said in problem
description.
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc =
encodes[(size_t)countl_zero<unsigned char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
Bonita Montero <Bonita.Montero@gmail.com> writes:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc =
encodes[(size_t)countl_zero<unsigned char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
Just for fun -
size_t
utf8_units_two( const char *s ){
size_t r = -1;
unsigned char c;
next:
switch( r++, c = *s++, c >> 3 ){
cases(16,17,18,19,20,21,22,23,31): return -1;
cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;
cases(0): if( c != 0 ) goto next;
}
return r;
}
(Note: the cases() macro produces one 'case X:' for each
argument X except the last one where the ':' is omitted,
written using the standard C preprocessor, and left as an
exercise for any ambitious readers.)
Incidentally, this code provides the only example I remember
where using 'goto' is pretty much unavoidable, in that any
re-writing without 'goto' seems awkward or inferior in some
other way. I guess I should say, at least not that I could
find, maybe someone else can do better.
This code also has the interesting property that along the main
code path there are no conditional branches (as compiled by gcc
under -O2).
On 17 Jan 2022 19:28, Tim Rentsch wrote:
Bonita Montero <Bonita.Montero@gmail.com> writes:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc =
encodes[(size_t)countl_zero<unsigned char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
Just for fun -
size_t
utf8_units_two( const char *s ){
size_t r = -1;
unsigned char c;
next:
switch( r++, c = *s++, c >> 3 ){
cases(16,17,18,19,20,21,22,23,31): return -1;
cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;
cases(0): if( c != 0 ) goto next;
}
return r;
}
That's very ugly.
(Note: the cases() macro produces one 'case X:' for each
argument X except the last one where the ':' is omitted,
written using the standard C preprocessor, and left as an
exercise for any ambitious readers.)
Exercise for you: test that unrevealed macro with Visual C++.
(Making it work also with Visual C++ is doable.)
Incidentally, this code provides the only example I remember
where using 'goto' is pretty much unavoidable, in that any
re-writing without 'goto' seems awkward or inferior in some
other way. I guess I should say, at least not that I could
find, maybe someone else can do better.
The `for(;;)`, `continue;` and `break;` constructs come to mind, so
you're right, someone else can do better.
At a guess the code counts the number of Unicode code points?
Does it return -1 also for "too long" UTF-8 sequence?
This code also has the interesting property that along the main
code path there are no conditional branches (as compiled by gcc
under -O2).
Smart compiler removed all the `if`'s?
I guess you're flame-baiting a little just for fun, but I chose to
take it seriously.
On 17 Jan 2022 19:28, Tim Rentsch wrote:
(Note: the cases() macro produces one 'case X:' for each
argument X except the last one where the ':' is omitted,
written using the standard C preprocessor, and left as an
exercise for any ambitious readers.)
Exercise for you: test that unrevealed macro with Visual C++.
Bonita Montero <Bonita.Montero@gmail.com> writes:
I had an idea how to write a fast UTF-8 strlen, here it is:
size_t utf8Strlen( char const *str )
{
struct encode_t { size_t lenIncr, strIncr; };
static encode_t const encodes[] =
{
{ 1, 1 },
{ 0, 0 },
{ 1, 2 },
{ 1, 3 },
{ 1, 4 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 },
{ 0, 0 }
};
size_t len = 0;
for( unsigned char c; (c = *str); )
{
encode_t const &enc =
encodes[(size_t)countl_zero<unsigned char>( ~c )];
if( !enc.lenIncr ) [[unlikely]]
return -1;
len += enc.lenIncr;
for( char const *cpEnd = str + enc.strIncr; ++str != cpEnd; )
if( ((unsigned char)*str & 0x0C0) != 0x080 ) [[unlikely]]
return -1;
}
return len;
}
Has anyone further ideas to improve this ?
Just for fun -
size_t
utf8_units_two( const char *s ){
size_t r = -1;
unsigned char c;
next:
switch( r++, c = *s++, c >> 3 ){
cases(16,17,18,19,20,21,22,23,31): return -1;
cases(30): if( c = *s++, c >> 6 != 2 ) return -1;
cases(28,29): if( c = *s++, c >> 6 != 2 ) return -1;
cases(24,25,26,27): if( c = *s++, c >> 6 != 2 ) return -1;
cases(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15): goto next;
cases(0): if( c != 0 ) goto next;
}
return r;
}
"Alf P. Steinbach" <alf.p.steinbach@gmail.com> writes:
On 17 Jan 2022 19:28, Tim Rentsch wrote:
(Note: the cases() macro produces one 'case X:' for each
argument X except the last one where the ':' is omitted,
written using the standard C preprocessor, and left as an
exercise for any ambitious readers.)
Exercise for you: test that unrevealed macro with Visual C++.
Update after my previous message. The cases() macro I wrote has
now been tested with Visual C++, and it compiles there without
complaint.
Sysop: | Keyop |
---|---|
Location: | Huddersfield, West Yorkshire, UK |
Users: | 113 |
Nodes: | 8 (1 / 7) |
Uptime: | 32:42:33 |
Calls: | 2,498 |
Files: | 8,649 |
Messages: | 1,906,385 |