• feature share : arbitrary-byte-safe array splitting in gawk-unicode mode

    From Kpop 2GM@21:1/5 to All on Sat Jun 4 00:49:55 2022
    GNU gawk normally generates warning messages when bytes that are not valid UTF-8 are passed to functions like length(), index(), or match(). At the same time, the default array-splitting approach involves either

    1. keeping UTF-8 characters intact, which adds an extra layer of complexity if you want to perform operations on the individual bytes, e.g. URL-encoding or base64 encoding (without having to resort to LC_ALL=C),

    or

    2. padding custom SEPs between every single byte, which wastes an individual array cell on each and every ASCII-safe alphanumeric, when it's perfectly safe to take a whole chunk of them out and substring out what you need.

    here's a scripting level solution that doesn't require custom gawk-extension libraries, allowing one to handle ANY arbitrary combination of binary bytes, in gawk-unicode mode, without triggering any warning messages, nor needing to manually suppress them.

    The BAU (business-as-usual) way takes 26 array cells, and you still have to deal with manually splitting up the UTF-8 characters.

    The new approach needs 20 cells, and also provides useful metadata to the coder -

    e.g. one can quickly identify array indices 4-7 as a 3-byte lead followed by three continuation bytes, i.e. one continuation byte more than a 3-byte sequence allows,

    | [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
    | [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
    | [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
    | [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>

    so together they constitute one valid 3-byte UTF-8 character, specifically U+C237 (bytes EC 88 B7), plus one extra stray continuation byte (BB) that is invalid on its own.

    * caveat 1 : it acts up when using gawk -P flag, but should be mostly safe in gawk -e.

    * caveat 2 : it's not 100% fool-proof, in the sense I've only included 3 different custom-SEP pairs without using any of the named ones ,

    so you'll need to modify it if you absolutely need it to handle any arbitrary input without the SEPs conflicting with the actual input data and leading to unexpected results.

    ***** the 0x dfxx only look like that cuz of legacy code-pages. they're individual 8-bit bytes .

    -- 0x df80 ( 57216 ) :: [ Ä ]
    -- 0x df81 ( 57217 ) :: [ Å ]
    -- 0x df82 ( 57218 ) :: [ Ç ]
    -- 0x df83 ( 57219 ) :: [ É ]
    -- 0x df84 ( 57220 ) :: [ Ñ ]
    -- 0x df85 ( 57221 ) :: [ Ö ]
    -- 0x df86 ( 57222 ) :: [ Ü ]
    -- 0x df87 ( 57223 ) :: [ á ]
    -- 0x df88 ( 57224 ) :: [ à ]
    -- 0x df89 ( 57225 ) :: [ â ]
    -- 0x df8a ( 57226 ) :: [ ä ]
    -- 0x df8b ( 57227 ) :: [ ã ]
    -- 0x df8c ( 57228 ) :: [ å ]
    -- 0x df8d ( 57229 ) :: [ ç ]
    -- 0x df8e ( 57230 ) :: [ é ]
    -- 0x df8f ( 57231 ) :: [ è ]
    -- 0x df90 ( 57232 ) :: [ ê ]
    -- 0x df91 ( 57233 ) :: [ ë ]
    -- 0x df92 ( 57234 ) :: [ í ]
    -- 0x df93 ( 57235 ) :: [ ì ]
    -- 0x df94 ( 57236 ) :: [ î ]
    -- 0x df95 ( 57237 ) :: [ ï ]
    -- 0x df96 ( 57238 ) :: [ ñ ]
    -- 0x df97 ( 57239 ) :: [ ó ]
    -- 0x df98 ( 57240 ) :: [ ò ]
    -- 0x df99 ( 57241 ) :: [ ô ]
    -- 0x df9a ( 57242 ) :: [ ö ]
    -- 0x df9b ( 57243 ) :: [ õ ]
    -- 0x df9c ( 57244 ) :: [ ú ]
    -- 0x df9d ( 57245 ) :: [ ù ]
    -- 0x df9e ( 57246 ) :: [ û ]
    -- 0x df9f ( 57247 ) :: [ ü ]
    -- 0x dfa0 ( 57248 ) :: [ † ]
    -- 0x dfa1 ( 57249 ) :: [ ° ]
    -- 0x dfa2 ( 57250 ) :: [ ¢ ]
    -- 0x dfa3 ( 57251 ) :: [ £ ]
    -- 0x dfa4 ( 57252 ) :: [ § ]
    -- 0x dfa5 ( 57253 ) :: [ • ]
    -- 0x dfa6 ( 57254 ) :: [ ¶ ]
    -- 0x dfa7 ( 57255 ) :: [ ß ]
    -- 0x dfa8 ( 57256 ) :: [ ® ]
    -- 0x dfa9 ( 57257 ) :: [ © ]
    -- 0x dfaa ( 57258 ) :: [ ™ ]
    -- 0x dfab ( 57259 ) :: [ ´ ]
    -- 0x dfac ( 57260 ) :: [ ¨ ]
    -- 0x dfad ( 57261 ) :: [ ≠ ]
    -- 0x dfae ( 57262 ) :: [ Æ ]
    -- 0x dfaf ( 57263 ) :: [ Ø ]
    -- 0x dfb0 ( 57264 ) :: [ ∞ ]
    -- 0x dfb1 ( 57265 ) :: [ ± ]
    -- 0x dfb2 ( 57266 ) :: [ ≤ ]
    -- 0x dfb3 ( 57267 ) :: [ ≥ ]
    -- 0x dfb4 ( 57268 ) :: [ ¥ ]
    -- 0x dfb5 ( 57269 ) :: [ µ ]
    -- 0x dfb6 ( 57270 ) :: [ ∂ ]
    -- 0x dfb7 ( 57271 ) :: [ ∑ ]
    -- 0x dfb8 ( 57272 ) :: [ ∏ ]
    -- 0x dfb9 ( 57273 ) :: [ π ]
    -- 0x dfba ( 57274 ) :: [ ∫ ]
    -- 0x dfbb ( 57275 ) :: [ ª ]
    -- 0x dfbc ( 57276 ) :: [ º ]
    -- 0x dfbd ( 57277 ) :: [ Ω ]
    -- 0x dfbe ( 57278 ) :: [ æ ]
    -- 0x dfbf ( 57279 ) :: [ ø ]
    -- 0x dfc0 ( 57280 ) :: [ ¿ ]
    -- 0x dfc1 ( 57281 ) :: [ ¡ ]
    -- 0x dfc2 ( 57282 ) :: [ ¬ ]
    -- 0x dfc3 ( 57283 ) :: [ √ ]
    -- 0x dfc4 ( 57284 ) :: [ ƒ ]
    -- 0x dfc5 ( 57285 ) :: [ ≈ ]
    -- 0x dfc6 ( 57286 ) :: [ ∆ ]
    -- 0x dfc7 ( 57287 ) :: [ « ]
    -- 0x dfc8 ( 57288 ) :: [ » ]
    -- 0x dfc9 ( 57289 ) :: [ … ]
    -- 0x dfca ( 57290 ) :: [   ]
    -- 0x dfcb ( 57291 ) :: [ À ]
    -- 0x dfcc ( 57292 ) :: [ Ã ]
    -- 0x dfcd ( 57293 ) :: [ Õ ]
    -- 0x dfce ( 57294 ) :: [ Π]
    -- 0x dfcf ( 57295 ) :: [ œ ]
    -- 0x dfd0 ( 57296 ) :: [ – ]
    -- 0x dfd1 ( 57297 ) :: [ — ]
    -- 0x dfd2 ( 57298 ) :: [ “ ]
    -- 0x dfd3 ( 57299 ) :: [ ” ]
    -- 0x dfd4 ( 57300 ) :: [ ‘ ]
    -- 0x dfd5 ( 57301 ) :: [ ’ ]
    -- 0x dfd6 ( 57302 ) :: [ ÷ ]
    -- 0x dfd7 ( 57303 ) :: [ ◊ ]
    -- 0x dfd8 ( 57304 ) :: [ ÿ ]
    -- 0x dfd9 ( 57305 ) :: [ Ÿ ]
    -- 0x dfda ( 57306 ) :: [ ⁄ ]
    -- 0x dfdb ( 57307 ) :: [ € ]
    -- 0x dfdc ( 57308 ) :: [ ‹ ]
    -- 0x dfdd ( 57309 ) :: [ › ]
    -- 0x dfde ( 57310 ) :: [ fi ]
    -- 0x dfdf ( 57311 ) :: [ fl ]
    -- 0x dfe0 ( 57312 ) :: [ ‡ ]
    -- 0x dfe1 ( 57313 ) :: [ · ]
    -- 0x dfe2 ( 57314 ) :: [ ‚ ]
    -- 0x dfe3 ( 57315 ) :: [ „ ]
    -- 0x dfe4 ( 57316 ) :: [ ‰ ]
    -- 0x dfe5 ( 57317 ) :: [ Â ]
    -- 0x dfe6 ( 57318 ) :: [ Ê ]
    -- 0x dfe7 ( 57319 ) :: [ Á ]
    -- 0x dfe8 ( 57320 ) :: [ Ë ]
    -- 0x dfe9 ( 57321 ) :: [ È ]
    -- 0x dfea ( 57322 ) :: [ Í ]
    -- 0x dfeb ( 57323 ) :: [ Î ]
    -- 0x dfec ( 57324 ) :: [ Ï ]
    -- 0x dfed ( 57325 ) :: [ Ì ]
    -- 0x dfee ( 57326 ) :: [ Ó ]
    -- 0x dfef ( 57327 ) :: [ Ô ]
    -- 0x dff0 ( 57328 ) :: [  ]
    -- 0x dff1 ( 57329 ) :: [ Ò ]
    -- 0x dff2 ( 57330 ) :: [ Ú ]
    -- 0x dff3 ( 57331 ) :: [ Û ]
    -- 0x dff4 ( 57332 ) :: [ Ù ]
    -- 0x dff5 ( 57333 ) :: [ ı ]
    -- 0x dff6 ( 57334 ) :: [ ˆ ]
    -- 0x dff7 ( 57335 ) :: [ ˜ ]
    -- 0x dff8 ( 57336 ) :: [ ¯ ]
    -- 0x dff9 ( 57337 ) :: [ ˘ ]
    -- 0x dffa ( 57338 ) :: [ ˙ ]
    -- 0x dffb ( 57339 ) :: [ ˚ ]
    -- 0x dffc ( 57340 ) :: [ ¸ ]
    -- 0x dffd ( 57341 ) :: [ ˝ ]
    -- 0x dffe ( 57342 ) :: [ ˛ ]
    -- 0x dfff ( 57343 ) :: [ ˇ ]

    { \0 # >NULL-byte }
    { \a \7 # >BEL/alert }
    { \b \10 # >backspc }
    { \t \11 # >h-TAB }
    { \n \12 # >NL line-feed }
    { \v \13 # >v-TAB }
    { \f \14 # >FF form-feed }
    { \r \15 # >\r\n most common }
    { \33 # >ESCAPE; \e awk-invalid }
    { \34 # >SUBSEP-def. }
    { \177 # >DELETE }
    { \37 1F # [:cntrl:] }
    { \36 1E # [:cntrl:] }
    { \35 1D # [:cntrl:] }
    { \32 1A # [:cntrl:] }
    { \31 19 # [:cntrl:] }
    { \30 18 # [:cntrl:] }
    { \27 17 # [:cntrl:] }
    { \26 16 # [:cntrl:] }
    { \25 15 # [:cntrl:] }
    { \24 14 # [:cntrl:] }
    { \23 13 # [:cntrl:] }
    { \22 12 # [:cntrl:] }
    { \21 11 # [:cntrl:] }
    { \20 10 # [:cntrl:] }
    { \17 0F # [:cntrl:] }
    { \16 0E # [:cntrl:] }
    { \06 06 # [:cntrl:] }
    { \05 05 # [:cntrl:] }
    { \04 04 # [:cntrl:] }
    { \03 03 # [:cntrl:] }
    { \02 02 # [:cntrl:] }
    { \01 01 # [:cntrl:] }
    161
    input length( abc
    12%3=숷??:??5@XYZ6 ~~~ 30 bytes


    | [ 1] { 1 } <( "a" )>
    | [ 2] { 1 } <( "b" )>
    | [ 3] { 1 } <( "c" )>
    | [ 4] { 1 } <( { \f \14 # >FF form-feed } )>
    | [ 5] { 1 } <( "1" )>
    | [ 6] { 1 } <( "2" )>
    | [ 7] { 1 } <( "%" )>
    | [ 8] { 1 } <( "3" )>
    | [ 9] { 1 } <( "=" )>
    | [ 10] { 3 } <( "숷" )>
    | [ 11] { 1 } <( { \273 BB # utf-8 cont-byte } )>
    | [ 12] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
    | [ 13] { 1 } <( "+" )>
    | [ 14] { 1 } <( { \300 C0 # utf-8 INVALID } )>
    | [ 15] { 1 } <( { \255 AD # utf-8 cont-byte } )>
    | [ 16] { 1 } <( ":" )>
    | [ 17] { 1 } <( { \374 FC # utf-8 INVALID } )>
    | [ 18] { 1 } <( { \27 17 # [:cntrl:] } )>
    | [ 19] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
    | [ 20] { 1 } <( "5" )>
    | [ 21] { 3 } <( "" )>
    | [ 22] { 1 } <( "@" )>
    | [ 23] { 1 } <( "X" )>
    | [ 24] { 1 } <( "Y" )>
    | [ 25] { 1 } <( "Z" )>
    | [ 26] { 1 } <( "6" )>

    .
    _____AFTER_NEW_SPLITTING_____
    .
    .
    .

    | [ 1] { 3 } <( "abc" )>
    | [ 2] { 1 } <( { \f \14 # >FF form-feed } )>
    | [ 3] { 5 } <( "12%3=" )>
    | [ 4] { 1 } <( { \354 EC # utf-8 3-bytes lead } )>
    | [ 5] { 1 } <( { \210 88 # utf-8 cont-byte } )>
    | [ 6] { 1 } <( { \267 B7 # utf-8 cont-byte } )>
    | [ 7] { 1 } <( { \273 BB # utf-8 cont-byte } )>
    | [ 8] { 1 } <( { \33 # >ESCAPE; \e awk-invalid } )>
    | [ 9] { 1 } <( "+" )>
    | [ 10] { 1 } <( { \300 C0 # utf-8 INVALID } )>
    | [ 11] { 1 } <( { \255 AD # utf-8 cont-byte } )>
    | [ 12] { 1 } <( ":" )>
    | [ 13] { 1 } <( { \374 FC # utf-8 INVALID } )>
    | [ 14] { 1 } <( { \27 17 # [:cntrl:] } )>
    | [ 15] { 1 } <( { \344 E4 # utf-8 3-bytes lead } )>
    | [ 16] { 1 } <( "5" )>
    | [ 17] { 1 } <( { \357 EF # utf-8 3-bytes lead } )>
    | [ 18] { 1 } <( { \243 A3 # utf-8 cont-byte } )>
    | [ 19] { 1 } <( { \277 BF # utf-8 cont-byte } )>
    | [ 20] { 5 } <( "@XYZ6" )>

    # gawk profile, created Sat Jun 4 03:08:09 2022

    LC_ALL= LANG="en_US.UTF-8" gawk -d- -p- -e '

    # BEGIN rule(s)

    # Demo driver. Builds the label table via initOct(), then prints one
    # 30-byte test string split two ways: first with split(s, arr, ""),
    # then with the separator-injection scheme further down.
    # The integer at the start of each statement is the gawk profiler's
    # execution count for that statement, not part of the program.
    BEGIN {
    # initOct() fills the global octalRE_L and returns its entry count,
    # which is what gets printed here (161 in the captured output).
    1 print initOct()
    1 _ = ""
    # Regex: the whole string consists only of bytes from space through tilde.
    1 _____ = "^[ -~]*$"
    # Assign the test string to _ and print it followed by its size:
    # length(_) when the printable-ASCII test passes, otherwise
    # match(_, "$") - 1 (reported as "30 bytes" in the captured output).
    # NOTE(review): /^[ -~]*?/ can match an empty prefix, so the second
    # test looks redundant next to the first -- confirm intent.
    1 print " input length( ",
    (_ = "abc\f12%3=\354\210\267\273\033+"\
    "\300\255:\374\027\3445\357\243\277@XYZ6") (" ~~~ "),
    ((_) ~ (_____)) && (_ ~ /^[ -~]*?/) \
    ? length(_) " utf8 chars" \
    : (match(_, "$") - 1) " bytes\n\n"

    # Baseline: split with an empty separator (26 cells for this input).
    1 split(_, __, "")
    # For each cell print: index, size (same length()/match() choice as
    # above), and either its octalRE_L label or the cell itself in quotes.
    26 for (____ in __) {
    26 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
    ((___ = __[____]) ~ _____) && (___ ~ /^[ -~]*?/) ? length(___) : match(___, "$")-1,
    (___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
    }

    1 printf ("\n .\n _____AFTER_NEW_SPLITTING_____\n .\n .\n .\n\n")

    # Choose two separators that do not occur in the input.
    # ___   : try "\31", then "\24\23", else fall back to "\5\4\32\1".
    # _____ : try "\25", then "\35\17", else fall back to "\26\36\6\16".
    # The last fallback in each chain is taken without checking the input.
    1 ___ = (_) !~ (___ = "\31") ? (___) : _ !~ ( ___ = "\24\23") ? ___ : "\5\4\32\1"
    1 _____ = _ !~ (_____ = "\25") ? _____ : _ !~ (_____="\35\17") ? _____ : "\26\36\6\16"

    # Step 1: insert ___ at every position where the empty regex matches.
    1 gsub("", ("&") ___, _)
    # Step 2: after each run of (printable-ASCII byte + ___) add one more ___,
    #         so the end of every printable run is marked by a doubled ___.
    1 gsub("([ -~]" (___) ")+", ("&") ___, _)
    # Step 3: tag every printable-ASCII byte with a trailing _____.
    1 gsub("[ -~]", ("&") _____, _)
    # Step 4: delete the leading ___, every "_____ ___" pair, and the trailing
    #         ___. Inside a printable run this removes all separators; at the
    #         end of a run one ___ survives.
    1 gsub("^" (___) "|" (_____) (___) "|" (___) "$", "", _)
    # Splitting on ___ now yields 20 cells for this input: runs of printable
    # ASCII stay together, every other byte gets its own cell.
    1 split(_, __, ___)
    # Restore the printable-ASCII regex (_____ was reused as a separator above).
    1 _____ = "^[ -~]*$"

    # Same report as the first loop, over the new cells.
    20 for (____ in __) {
    20 printf " | [%3.f] { %3.f } <( %45.45s )> \n", ____,
    (___=__[____]) ~_____ ? length(___) : match(___, "$")-1,
    (___ in octalRE_L) ? octalRE_L[___] : "\"" (___) "\""
    }
    }

    # Functions, listed alphabetically

    # initOct: populate the global array octalRE_L with a printable label for
    # each "special" byte, echoing the table to stdout as it is built.
    #   - keys produced by sprintf("%c", 0xDF80 .. 0xDFFF), labelled with the
    #     octal/hex of 128..255 plus a UTF-8 role (cont-byte / N-bytes lead /
    #     INVALID); the author describes these keys as individual 8-bit bytes
    #   - the named escapes \0 \a \b \t \n \v \f \r, plus \33, \34 and \177
    #   - any of bytes 1..31 not already present, labelled [:cntrl:]
    # All five parameters serve as local scratch variables; the caller passes
    # no arguments. Returns the number of entries in octalRE_L.
    1 function initOct(_, __, ___, ____, _____)
    {
    # Obfuscated constants. In gawk (and consistent with the printed table)
    # these evaluate to: __ = 57343 (0xDFFF), ___ = -128, _ = 256;
    # ++__ then makes __ = 57344 (0xE000). The result depends on the order in
    # which the ++/-- side effects are applied within each expression.
    1 __ = -(_ ^= _ < _) + (++_ - ++_ ^ _ + _ ^ ++_) * (_ ^ ++_)
    1 ___ = -(_-- ^ --_ + _)
    1 _ ^= ++_
    1 ++__
    # 128 iterations: ___ runs from -128 to -1, so __ + ___ covers
    # 0xDF80..0xDFFF and ___ + _ covers 128..255.
    128 do {
    128 ____ = ""
    # Print the table row and capture the %c result in ____ as the array key.
    128 printf " -- 0x %4x ( %5.f ) :: [ %3.1s ]\n", __ + ___, __ + ___, ____ = sprintf("%c", __ + ___)
    # Label: octal and hex of the byte value, then a UTF-8 role picked by
    # string comparison of the key against boundary bytes:
    #   < \300 cont-byte | < \302 INVALID | < \340 2-bytes lead |
    #   < \360 3-bytes lead | < \365 4-bytes lead | otherwise INVALID
    128 octalRE_L[____] = \
    sprintf("{ \\%03o %.2X # %-20.25s }", ___ + _, _ + ___,
    ("utf-8 ") ((____ < "\300") ? "cont-byte" \
    : (____ < "\340") ? ((____ < "\302") ? "INVALID" : "2-bytes lead") \
    : (____ < "\365") ? (4 - (____ < "\360")) "-bytes lead" : "INVALID" ))
    128 ____ = ""
    } while (++___ < -___)
    # Use ORS as the field separator so the next print emits one label per line.
    1 OFS = ORS
    # Register the named control characters and print each label as it is stored.
    1 print "", octalRE_L["\0"] = "{ \\0 # >NULL-byte }",
    octalRE_L["\a"] = "{ \\a \\7 # >BEL/alert }",
    octalRE_L["\b"] = "{ \\b \\10 # >backspc }",
    octalRE_L["\t"] = "{ \\t \\11 # >h-TAB }",
    octalRE_L["\n"] = "{ \\n \\12 # >NL line-feed }",
    octalRE_L["\v"] = "{ \\v \\13 # >v-TAB }",
    octalRE_L["\f"] = "{ \\f \\14 # >FF form-feed }",
    octalRE_L["\r"] = "{ \\r \\15 # >\\r\\n most common }",

    octalRE_L["\33"] = "{ \\33 # >ESCAPE; \\e awk-invalid }",
    octalRE_L["\34"] = "{ \\34 # >SUBSEP-def. }",
    octalRE_L["\177"] = "{ \\177 # >DELETE }"
    1 _ = ""
    # Set OFS to the current value of FS.
    # NOTE(review): this restores the previous OFS only if it equalled FS
    # (both are " " in the variable dump) -- confirm that is intended.
    1 OFS = FS
    # The initialiser evaluates to 31 in gawk (the loop runs 31 times);
    # count down from 31 to 1.
    31 for (_ = (_ += _ ^= _ < _) ^ _ - +-(++_ ^ _); _; _--) {
    # Only bytes without a named label get the generic [:cntrl:] label
    # (22 of the 31 in the captured run).
    31 if (! ((__ = sprintf("%c", _)) in octalRE_L)) { # 22
    22 print octalRE_L[__] = sprintf("{ \\%.2o %.2X # [:cntrl:] }", _, _)
    }
    }
    # Count the entries in octalRE_L (161 in the captured run) and return
    # the count as a number.
    1 _ = ""
    161 for (__ in octalRE_L) {
    161 ++_
    }
    1 return +_
    }

    ARGC: 1
    ARGIND: 0
    ARGV: array, 1 elements
    BINMODE: 0
    CONVFMT: "%.6g"
    ENVIRON: array, 123 elements
    ERRNO: ""
    FIELDWIDTHS: ""
    FILENAME: ""
    FNR: 0
    FPAT: "[^[:space:]]+"
    FS: " "
    FUNCTAB: array, 42 elements
    IGNORECASE: 0
    LINT: 0
    NF: 0
    NR: 0
    OFMT: "%.6g"
    OFS: " "
    ORS: "\n"
    PREC: 53
    PROCINFO: array, 35 elements
    RLENGTH: 0
    ROUNDMODE: "N"
    RS: "\n"
    RSTART: 2
    RT: ""
    SUBSEP: "\034"
    SYMTAB: array, 34 elements
    TEXTDOMAIN: "messages"
    _: "abc\031\f\03112%3=\031\354\031\210\031\267\031\273\031\033\031+\031\300\031\255\031:\031\374\031\027\031\344\0315\031\357\031\243\031\277\031@XYZ6"
    __: array, 20 elements
    ___: "@XYZ6"
    ____: 20
    _____: "^[ -~]*$"
    octalRE_L: array, 161 elements

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)