/* OS/2 REXX or NT ooREXX: convert UTF-8 to SBCS and vice versa. */ /* The REXX source is designed for SBCS codepages based on ASCII, */ /* anything else (EBCDIC, DBCS, UTF-7, etc.) won't work. */ /* This is only the code and a small test suite, copy procedures */ /* UTF.I (UTF-8 to local), UTF.O (local to UTF-8), and UTF.8 to a */ /* a script needing UTF-8 conversions. */ /* UTF.I( x, cp ) decodes an UTF-8 string x for codepage cp. */ /* UTF.O( x, cp ) encodes a codepage cp string x into UTF-8. */ /* The 2nd argument cp can be omitted after it was initialized. */ /* UTF. is a global variable exposed by UTF.I() and UTF.O(), it */ /* is reinitialized if a 2nd argument for UTF.I() or UTF.O() does */ /* not match the last used local codepage. */ /* History - see also : */ /* 0.1 - added codepage 437 using */ /* - obvious bug in UTF.I() REXX positional parsing fixed :-( */ /* 0.2 - avoid syntax trap in UTF.I() for invalid UTF.8 strings */ /* - added UTF.I() test cases for nine invalid UTF-8 strings */ /* - moved old tests to procedure DEBUG, two codepages tested */ /* - use OS/2 SysQueryProcessCodePage() directly (+ comments) */ /* 0.3 - 80..BF now "eat" only 1 byte, shown as one unknown char. */ /* - C0..C1 still "eat" 2 bytes, shown as 1 unknown character */ /* - F5..F7 still "eat" 4 bytes (F5..FD illegal for RfC 3629) */ /* - F8..FB still "eat" 5 bytes (F5..FD unused for ISO 10646) */ /* - FC..FD still "eat" 6 bytes (F5..FD allow 2**31 Unicodes) */ /* - FE..FF now "eat" only 1 byte, shown as one unknown char. */ /* - added tests EF BB BF (u+FEFF BOM) and C0 AF (bad 2F '/') */ /* - bad / unknown / unsupported character shown as UTF.? set */ /* by UTF. = '?', any US-ASCII character could be used */ /* 0.4 - bug fix for windows-1252 (OS/2 1004) 8D, 8E, 9D, 9E, 9F */ /* 0.5 - bug fix for invalid u+4000000 encoding as FC84 8080 8080 */ /* etc. only used in */ /* 0.6 - SysQueryProcessCodePage() removed: UTF.I() and UTF.O() */ /* now expect a 2nd argument specifying the local codepage */ /* 0.7 - replaced UTF-8 prose explanation by simple CharMapML */ /* - replaced '?' by ASCII SUB (0x1A) for unmapped char.s */ /* - added Latin-9 and MacRoman; explicit Latin-1, no default */ /* 0.8 - added ibm-878 (KOI8-R) for the Russian OS/2 community */ /* 0.9 - renamed 'MAC' Roman to '10000' (the number used on W2K) */ /* added '28591' as alias of '819' for ISO 8859-1 */ /* added '28605' as alias of '923' for ISO 8859-15 */ /* Various not yet supported W2K codepages to complete the */ /* already implemented Latin-1 and Cyrillic variants, plus */ /* some obscure W2K codepages noted here "while I'm at it": */ /* 855: OEM Cyrillic */ /* 866: OEM Russian */ /* 1251: ANSI Cyrillic, presumably 28595 excl. C1 controls */ /* 10017: MAC Cyrillic */ /* 28593: IS0 8859-3 (Latin-3, Esperanto) */ /* 28595: IS0 8859-5 (Cyrillic) */ /* 28599: IS0 8859-9 (Latin-5) */ /* 65001: UTF-8 ToDo: find IBM UTF-8 codepage number */ /* 20127: US-ASCII ToDo: figure out what US-ASCII is... */ /* 20105: IA5 IRV ToDo: allow pure 7bit US-ASCII input */ /* 20106: IA5 German (out of scope, noted for reference) */ /* 20261: T.61 ToDo: what is this ? */ /* 20269: ISO 6937 non-spacing accent (out of scope) */ /* 21027: Ext Alpha lower case ToDo: what is this ? */ /* -------------------------------------------------------------- -------------------------------------------------------------- */ signal on novalue name TRAP ; signal on syntax name TRAP signal on failure name TRAP ; signal on halt name TRAP if UTF.O( /**/, 819 ) \== '' then exit TRAP( 'init. Latin-1' ) U = x2c( 77 66 55 44 33 22 ) /* up to 5 char.s "eaten" by */ do N = 0 to 8 /* test invalid UTF-8 strings */ C = x2c( 22 || b2x( left( copies( 1, N ), 8, 0 ))) || U if N = 0 then C = x2c( '22 EF BB BF 22 C0 AF 22' ) say 'bad UTF-8' c2x( C ) '=>' c2x( UTF.I( C )) UTF.I( C ) end N Q = '437 858 1252 819 923 878 10000' do W = 1 to words( Q ) CP = word( Q, W ) select when CP = 437 then P = '( US PC DOS) 437:' when CP = 858 then P = '( OS/2 850) 858:' when CP = 1252 then P = '( OS/2 1004) 1252:' when CP = 819 then P = '(ISO 8859-1) 819:' when CP = 923 then P = '(ISO 8859-15) 923:' when CP = 878 then P = '( KOI8-R ) 878:' when CP = 10000 then P = '(MAC Roman ) 10000:' otherwise P = right( CP, 18 ) || ':' end say P DEBUG( CP ) end W exit 0 DEBUG: procedure do N = 0 to 255 /* check 256 local characters */ C = centre( d2c( N ), 3 ) ; U = UTF.O( C, arg( 1 )) if UTF.I( U ) == C then iterate N say 'error at' N ; trace ?R U = UTF.O( C ) ; call UTF.I U say result == C ; return 'fail' end N U = 128 /* find 128 UTF-8 characters: */ do N = U to 65535 until U = 256 B = reverse( x2b( d2x( N ))) ; C = '' do L = 2 until verify( substr( B, 8 - L ), 0 ) = 0 C = C || left( B, 6, 0 ) || 01 B = substr( B, 7 ) end L B = C || left( B, 8 - L, 0 ) || copies( 1, L ) C = x2c( b2x( reverse( B ))) U = U + ( UTF.I( C ) <> UTF.? ) end N /* test error character UTF.? */ N = 'found' U 'of 256 SBCS characters up to u+' || d2x( N, 4 ) if U = 256 then return 'okay,' N else return 'fail,' N /* -------------------------------------------------------------- */ /* 0.8, (c) F.Ellermann */ UTF.I: procedure expose UTF. /* UTF-8 to local charset */ parse arg SRC ; DST = '' ; UTF.8 = UTF.8( arg( 2 )) do while SRC <> '' POS = verify( SRC, UTF.8 ) -1 ; if POS < 0 then leave DST = DST || left( SRC, POS ) ; SRC = substr( SRC, POS + 1 ) POS = verify( x2b( c2x( left( SRC, 1 ))), 1 ) -1 if POS > 1 & POS < 7 then do /* C0..FD introduce 2-6 bytes */ TOP = left( SRC, POS ) ; SRC = substr( SRC, POS + 1 ) DST = DST || UTF.TOP /* surrogates implicitly bad, */ end /* C0..C1 are implicitly bad, */ else do /* 80..BF and FE..FF illegal: */ DST = DST || UTF.? ; SRC = substr( SRC, 2 ) end /* show error character UTF.? */ end return DST || SRC UTF.O: procedure expose UTF. /* local charset to UTF-8 */ parse arg SRC ; DST = '' ; UTF.8 = UTF.8( arg( 2 )) do while SRC <> '' POS = verify( SRC, UTF.8 ) -1 ; if POS < 0 then leave DST = DST || left( SRC, POS ) ; SRC = substr( SRC, POS + 1 ) parse var SRC TOP 2 SRC ; DST = DST || UTF.TOP end return DST || SRC UTF.8: procedure expose UTF. /* initialize Unicode table */ arg PAGE select when PAGE = value( 'UTF..' ) then nop when PAGE = '' & symbol( 'UTF..' ) = 'VAR' then nop otherwise if symbol( 'UTF.?' ) = 'VAR' then T = UTF.? else T = x2c( 1A ) drop UTF. ; UTF. = T /* SUB unknown char.s by 0x1A */ UTF.. = PAGE ; T = '' /* note actual codepage UTF.. */ select /* -------------------------- */ when PAGE = 437 then do /* US OEM DOS */ T = T ' C7 FC E9 E2 E4 E0 E5 E7' /* 80 */ T = T ' EA EB E8 EF EE EC C4 C5' /* 88 */ T = T ' C9 E6 C6 F4 F6 F2 FB F9' /* 90 */ T = T ' FF D6 DC A2 A3 A5 20A7 192' /* 98 */ T = T ' E1 ED F3 FA F1 D1 AA BA' /* A0 */ T = T ' BF 2310 AC BD BC A1 AB BB' /* A8 */ T = T '2591 2592 2593 2502 2524 2561 2562 2556' /* B0 */ T = T '2555 2563 2551 2557 255D 255C 255B 2510' /* B8 */ T = T '2514 2534 252C 251C 2500 253C 255E 255F' /* C0 */ T = T '255A 2554 2569 2566 2560 2550 256C 2567' /* C8 */ T = T '2568 2564 2565 2559 2558 2552 2553 256B' /* D0 */ T = T '256A 2518 250C 2588 2584 258C 2590 2580' /* D8 */ T = T ' 3B1 DF 393 3C0 3A3 3C3 B5 3C4' /* E0 */ T = T ' 3A6 398 3A9 3B4 221E 3C6 3B5 2229' /* E8 */ T = T '2261 B1 2265 2264 2320 2321 F7 2248' /* F0 */ T = T ' B0 2219 B7 221A 207F B2 25A0 A0' /* F8 */ end /* -------------------------- */ when PAGE = 858 | PAGE = 850 then do /* western DOS */ T = T ' C7 FC E9 E2 E4 E0 E5 E7' /* 80 */ T = T ' EA EB E8 EF EE EC C4 C5' /* 88 */ T = T ' C9 E6 C6 F4 F6 F2 FB F9' /* 90 */ T = T ' FF D6 DC F8 A3 D8 D7 192' /* 98 */ T = T ' E1 ED F3 FA F1 D1 AA BA' /* A0 */ T = T ' BF AE AC BD BC A1 AB BB' /* A8 */ T = T '2591 2592 2593 2502 2524 C1 C2 C0' /* B0 */ T = T ' A9 2563 2551 2557 255D A2 A5 2510' /* B8 */ T = T '2514 2534 252C 251C 2500 253C E3 C3' /* C0 */ T = T '255A 2554 2569 2566 2560 2550 256C A4' /* C8 */ T = T ' F0 D0 CA CB C8 20AC CD CE' /* D0 */ T = T ' CF 2518 250C 2588 2584 A6 CC 2580' /* D8 */ T = T ' D3 DF D4 D2 F5 D5 B5 FE' /* E0 */ T = T ' DE DA DB D9 FD DD AF B4' /* E8 */ T = T ' AD B1 2017 BE B6 A7 F7 B8' /* F0 */ T = T ' B0 A8 B7 B9 B3 B2 25A0 A0' /* F8 */ /* 0xD5 850: u+0131 small dotless i, 858: u+20AC Euro */ end /* -------------------------- */ when PAGE = 819 | PAGE = 28591 then do /* ISO 8859-1 */ do N = 128 to 255 ; T = T d2x( N ) ; end N /* 80-FF */ end /* -------------------------- */ when PAGE = 923 | PAGE = 28605 then do /* ISO 8859-15 */ do N = 128 to 159 ; T = T d2x( N ) ; end N /* 80-9F */ T = T ' A0 A1 A2 A3 20AC A5 160 A7' /* A0 */ T = T ' 161 A9 AA AB AC AD AE AF' /* A8 */ T = T ' B0 B1 B2 B3 17D B5 B6 B7' /* B0 */ T = T ' 17E B9 BA BB 152 153 178 BF' /* B8 */ do N = 192 to 255 ; T = T d2x( N ) ; end N /* C0-FF */ end /* -------------------------- */ when PAGE = 1252 | PAGE = 1004 then do /* OEM Latin-1 */ T = T '20AC 81 201A 192 201E 2026 2020 2021' /* 80 */ T = T ' 2C6 2030 160 2039 152 8D 17D 8F' /* 88 */ T = T ' 90 2018 2019 201C 201D 2022 2013 2014' /* 90 */ T = T ' 2DC 2122 161 203A 153 9D 17E 17F' /* 98 */ do N = 160 to 255 ; T = T d2x( N ) ; end N /* A0-FF */ end /* -------------------------- */ when PAGE = 878 then do /* KOI8-R (ibm-878) */ T = T '2500 2502 250C 2510 2514 2518 251C 2524' /* 80 */ T = T '252C 2534 253C 2580 2584 2588 258C 2590' /* 88 */ T = T '2591 2592 2593 2320 25A0 2219 221A 2248' /* 90 */ T = T '2264 2265 A0 2321 B0 B2 B7 F7' /* 98 */ T = T '2550 2551 2552 451 2553 2554 2555 2556' /* A0 */ T = T '2557 2558 2559 255A 255B 255C 255D 255E' /* A8 */ T = T '255F 2560 2561 401 2562 2563 2564 2565' /* B0 */ T = T '2566 2567 2568 2569 256A 256B 256C A9' /* B8 */ T = T ' 44E 430 431 446 434 435 444 433' /* C0 */ T = T ' 445 438 439 43A 43B 43C 43D 43E' /* C8 */ T = T ' 43F 44F 440 441 442 443 436 432' /* D0 */ T = T ' 44C 44B 437 448 44D 449 447 44A' /* D8 */ T = T ' 42E 410 411 426 414 415 424 413' /* E0 */ T = T ' 425 418 419 41A 41B 41C 41D 41E' /* E8 */ T = T ' 41F 42F 420 421 422 423 416 412' /* F0 */ T = T ' 42C 42B 417 428 42D 429 427 42A' /* F8 */ end /* -------------------------- */ when PAGE = '10000' then do /* MAC Roman */ T = T ' C4 C5 C7 C9 D1 D6 DC E1' /* 80 */ T = T ' E0 E2 E4 E3 E5 E7 E9 E8' /* 88 */ T = T ' EA EB ED EC EE EF F1 F3' /* 90 */ T = T ' F2 F4 F6 F5 FA F9 FB FC' /* 98 */ T = T '2020 B0 A2 A3 A7 2022 B6 DF' /* A0 */ T = T ' AE A9 2122 B4 A8 2260 C6 D8' /* A8 */ T = T '221E B1 2264 2265 A5 B5 2202 2211' /* B0 */ T = T '220F 3C0 222B AA BA 3A9 E6 F8' /* B8 */ T = T ' BF A1 AC 221A 192 2248 2206 AB' /* C0 */ T = T ' BB 2026 A0 C0 C3 D5 152 153' /* C8 */ T = T '2013 2014 201C 201D 2018 2019 F7 25CA' /* D0 */ T = T ' FF 178 2044 20AC 2039 203A FB01 FB02' /* D8 */ T = T '2021 B7 201A 201E 2030 C2 CA C1' /* E0 */ T = T ' CB C8 CD CE CF CC D3 D4' /* E8 */ T = T 'F8FF D2 DA DB D9 131 2C6 2DC' /* F0 */ T = T ' AF 2D8 2D9 2DA B8 2DD 2DB 2C7' /* F8 */ /* 0xBD old u+2126 Ohm : new u+03A9 Omega */ /* 0xDB old u+00A4 currency symbol : new u+20AC Euro */ /* 0xF0 old u+2665 black heart suit: new u+F8FF priv. */ end /* -------------------------- */ end /* otherwise force REXX error */ do N = 128 to 255 /* table of UTF-8 characters: */ parse var T SRC T ; DST = '' SRC = reverse( x2b( SRC )) /* scalar bits right to left */ do LEN = 2 until verify( substr( SRC, 8 - LEN ), 0 ) = 0 DST = DST || left( SRC, 6, 0 ) || '01' SRC = substr( SRC, 7 ) /* encoded 6 bits of scalar */ end LEN /* remaining bits of scalar: */ DST = DST || left( SRC, 7 - LEN, 0 ) || 0 DST = x2c( b2x( reverse( DST || copies( 1, LEN )))) SRC = d2c( N ) /* SRC: 1 byte (local char.) */ UTF.DST = SRC /* DST: 2 or more UTF-8 bytes */ UTF.SRC = DST /* excluding us-ascii 0..127 */ end N end return xrange( x2c( 0 ), x2c( 7F )) /* see , (c) F. Ellermann */ UTIL: procedure /* load necessary RexxUtil entry */ if RxFuncQuery( arg( 1 )) then if RxFuncAdd( arg( 1 ), 'RexxUtil', arg( 1 )) then exit TRAP( "can't add RexxUtil" arg( 1 )) return 0 TRAP: /* select REXX exception handler */ call trace 'O' ; trace N /* don't trace interactive */ parse source TRAP /* source on separate line */ TRAP = x2c( 0D ) || right( '+++', 10 ) TRAP || x2c( 0D0A ) TRAP = TRAP || right( '+++', 10 ) /* = standard trace prefix */ TRAP = TRAP strip( condition( 'c' ) 'trap:' condition( 'd' )) select when wordpos( condition( 'c' ), 'ERROR FAILURE' ) > 0 then do if condition( 'd' ) > '' /* need an additional line */ then TRAP = TRAP || x2c( 0D0A ) || right( '+++', 10 ) TRAP = TRAP '(RC' rc || ')' /* any system error codes */ if condition( 'c' ) = 'FAILURE' then rc = -3 end when wordpos( condition( 'c' ), 'HALT SYNTAX' ) > 0 then do if condition( 'c' ) = 'HALT' then rc = 4 if condition( 'd' ) > '' & condition( 'd' ) <> rc then do if condition( 'd' ) <> errortext( rc ) then do TRAP = TRAP || x2c( 0D0A ) || right( '+++', 10 ) TRAP = TRAP errortext( rc ) end /* future condition( 'd' ) */ end /* may use errortext( rc ) */ else TRAP = TRAP errortext( rc ) rc = -rc /* rc < 0: REXX error code */ end when condition( 'c' ) = 'NOVALUE' then rc = -2 /* dubious */ when condition( 'c' ) = 'NOTREADY' then rc = -1 /* dubious */ otherwise /* force non-zero whole rc */ if datatype( value( 'RC' ), 'W' ) = 0 then rc = 1 if rc = 0 then rc = 1 if condition() = '' then TRAP = TRAP arg( 1 ) end /* direct: TRAP( message ) */ TRAP = TRAP || x2c( 0D0A ) || format( sigl, 6 ) signal on syntax name TRAP.SIGL /* throw syntax error 3... */ if 0 < sigl & sigl <= sourceline() /* if no handle for source */ then TRAP = TRAP '*-*' strip( sourceline( sigl )) else TRAP = TRAP '+++ (source line unavailable)' TRAP.SIGL: /* ...catch syntax error 3 */ if abbrev( right( TRAP, 2 + 6 ), x2c( 0D0A )) then do TRAP = TRAP '+++ (source line unreadable)' ; rc = -rc end select when 0 then do /* in pipes STDERR: output */ parse version TRAP.REXX /* REXX/Personal: \dev\con */ if abbrev( TRAP.REXX, 'REXXSAA ' ) | /**/ , 6 <= word( TRAP.REXX, 2 ) then TRAP.REXX = 'STDERR' else TRAP.REXX = '\dev\con' signal on syntax name TRAP.FAIL call lineout TRAP.REXX , TRAP /* fails if no more handle */ end when 0 then do /* OS/2 PM or ooREXX on NT */ signal on syntax name TRAP.FAIL call RxMessageBox translate( TRAP, ' ', x2c( 0D )), /**/ , 'Trap' time(),, 'ERROR' end otherwise say TRAP ; trace ?L /* interactive Label trace */ end if condition() = 'SIGNAL' then signal TRAP.EXIT TRAP.CALL: return rc /* continue after CALL ON */ TRAP.FAIL: say TRAP ; rc = 0 - rc /* force TRAP error output */ TRAP.EXIT: exit rc /* exit for any SIGNAL ON */