/* OS/2 REXX: convert UTF-8 in specified file(s) to NCRs.  Check    */
/* procedure SFDT() for other platforms.  (Frank Ellermann, 2006)   */
/*                                                                  */
/* Usage:  <script> [file]     (wildcards okay)                     */
/* Each matching file is renamed to a temporary *.bak file, every   */
/* line is passed through UNCR(), and the result is written under   */
/* the old name.  If a line contained invalid UTF-8 (or a broken    */
/* hex. NCR), or if nothing had to be converted, the original file  */
/* is restored.  Without an argument the built-in test suite runs.  */

   signal on novalue  name TRAP ;  signal on syntax name TRAP
   signal on failure  name TRAP ;  signal on halt   name TRAP
   signal on notready name TRAP ;  signal on error  name TRAP

   /* U. is the state shared with UNCR() and TEST():                */
   /*   U.CNV  number of characters / NCRs converted so far         */
   /*   U.BAD  number of invalid sequences found so far             */
   /*   U.ASC  all US-ASCII bytes 00..7F (copied verbatim)          */
   /*   U.B64  all UTF-8 trail bytes 80..BF                         */
   /*   U.DEC  0: emit hex. NCRs, 1: emit dec. NCRs                 */
   /* NOTE: never create simple variables named CNV, BAD, ASC, B64  */
   /* or DEC, they would be substituted into these compound names.  */
   U.CNV = 0 ;  U.ASC = xrange( x2c( '00' ), x2c( '7F' ))
   U.BAD = 0 ;  U.B64 = xrange( x2c( '80' ), x2c( 'BF' ))
   U.DEC = 1                        /* 0: hex. NCRs, 1: dec. NCRs   */

   /* strip blanks, then optional surrounding quotes, then blanks   */
   FILE = strip( strip( strip( arg( 1 )), /**/, '"' ))
   if FILE = ''  then do
      parse source . . FILE
      say 'Usage:' FILE '[file]'
      say 'Converts UTF-8 in file (wildcard okay) to NCRs,'
      say 'needs write access on file and temporary bak-file.'
      say 'UTF-8 conversion test suite running:'
      exit TEST()
   end

   call UTIL 'SysFileTree'
   if SysFileTree( FILE, 'FILE', 'FO' ) <> 0 then FILE.0 = 0
   if FILE.0 = 0 then exit TRAP( FILE 'not found or error' )

   do N = 1 to FILE.0
      U.BAD = 0 ; U.CNV = 0         /* reset error + cnv counters   */

      /* Split the path first and look for the extension only in    */
      /* the plain file name: a dot in a directory name must not    */
      /* be mistaken for the start of the extension.                */
      P    = lastpos( '/', translate( FILE.N, '/', '\' ))
      NAME = substr( FILE.N, P + 1 )   /* file name without path    */
      DOT  = lastpos( '.', NAME ) - 1
      if DOT < 0 then DOT = length( NAME )   /* no extension        */
      BAK  = left( NAME, DOT ) || '.bak'     /* REN wants no path   */
      FILE = left( FILE.N, P ) || BAK        /* full bak-file path  */

      address CMD '@REN "' || FILE.N || '" "' || BAK || '"'

      do while sign( lines( FILE )) /* bak-file in, old name out    */
         call lineout FILE.N, UNCR( linein( FILE ))
      end
      call lineout FILE.N ;  call lineout FILE  /* close both files */

      if U.BAD <> 0 | U.CNV = 0  then do     /* restore original    */
         address CMD '@DEL "' || FILE.N || '"'
         address CMD '@REN "' || FILE || '" "' || NAME || '"'
         if U.BAD <> 0  then say U.BAD 'errors, reset' NAME
                        else say 'found no UTF-8 in' NAME
      end
      else do                       /* keep output, drop bak-file   */
         call SFDT FILE, FILE.N
         address CMD '@DEL "' || FILE || '"'
         say U.CNV 'UTF-8 converted to NCRs in' NAME
      end
   end N
   exit 0

/* ---------------------------------------------------------------- */
/* SFDT( SRC, DST ): copy date + time of file SRC to file DST.      */
/* Tries SysGetFileDateTime() + SysSetFileDateTime() first; if the  */
/* functions are unavailable (syntax error) or setting fails, it    */
/* falls back to an external `touch -r` command.                    */
/* Returns 0 if okay; a failing touch raises ERROR or FAILURE and   */
/* ends in TRAP.                                                    */

SFDT: procedure                     /* keep old file date + time,   */
   signal on syntax name SFDT.TRAP  /* tries Sys?etFileDateTime()   */
   parse arg SRC, DST               /* or any `touch -r` command:   */
   parse value SysGetFileDateTime( SRC ) with DATE TIME
   if SysSetFileDateTime( DST, DATE, TIME ) = 0 then return 0

SFDT.TRAP:                          /* fallback, global traps again */
   signal on syntax name TRAP
   signal on error  name TRAP ;  signal on failure name TRAP
   if address() = 'CMD'
      then '@touch -r "' || SRC || '" "' || DST || '"'
      else 'touch -r "' || SRC || '" "' || DST || '"'
   return rc                        /* 0: okay, else throw error    */

/* ---------------------------------------------------------------- */
/* UNCR( text ): return text with all non-ASCII UTF-8 sequences     */
/* replaced by numeric character references (NCRs).                 */
/*   valid sequence    -> &#decimal; (U.DEC = 1) or &#xHEX;         */
/*   invalid sequence  -> NCR of U+FFFD, and U.BAD is incremented   */
/* For U.DEC = 1 a second pass rewrites hex. NCRs that are already  */
/* part of the text as dec. NCRs (counted in U.CNV); a '&#x' that   */
/* is not a well-formed hex. NCR is kept as is and counted in U.BAD */

UNCR: procedure expose U.           /* convert UTF-8 text to NCRs   */
   parse arg SRC ;  DST = ''

   do while SRC <> ''               /* skip the rest if US-ASCII:   */
      LEN = verify( SRC, U.ASC ) -1 ;  if LEN < 0 then leave
      DST = DST || left( SRC, LEN ) ;  SRC = substr( SRC, LEN + 1 )
      parse var SRC LB 2 SRC ;  LB = c2d( LB )     /* lead byte     */

      TOP = 0                       /* high nibble of next byte     */
      if SRC \== '' then TOP = c2d( left( SRC, 1 )) % 16

      /* abs( LEN ) is the number of trail bytes to consume, a LEN  */
      /* less than or equal 0 marks the sequence as invalid.        */
      select                        /* for CESU remove both LB = 237 */
         when LB < 192              then LEN = -0  /* trail bytes   */
         when LB < 194              then LEN = -1  /* bad C0 + C1   */
         when LB < 224              then LEN = +1
         when LB = 224 & TOP =  8   then LEN = -2  /* E08x is bad   */
         when LB = 224 & TOP =  9   then LEN = -2  /* E09x is bad   */
         when LB = 237 & TOP = 10   then LEN = -2  /* EDAx is bad   */
         when LB = 237 & TOP = 11   then LEN = -2  /* EDBx is bad   */
         when LB < 240              then LEN = +2
         when LB = 240 & TOP =  8   then LEN = -3  /* F08x is bad   */
         when LB < 244              then LEN = +3
         when LB = 244 & TOP =  8   then LEN = +3  /* F48x is ok.   */
         when LB < 248              then LEN = -3  /* bad F4 - F7   */
         when LB < 252              then LEN = -4  /* bad F8 - FB   */
         when LB < 254              then LEN = -5  /* bad FC + FD   */
         otherwise                       LEN = -0  /* bad FE + FF   */
      end

      ERR = ( LEN <= 0 ) ; LEN = abs( LEN )
      if length( SRC ) < LEN  then do        /* truncated sequence  */
         ERR = 1 ;   LEN = length( SRC )
      end
      TOP = left( SRC, LEN ) ;   SRC = substr( SRC, LEN + 1 )
      TMP = verify( TOP, U.B64 )    /* first non-trail byte if any  */
      if TMP > 0 then do            /* eat plausible trailing bytes: */
         ERR = 1 ;   SRC = substr( TOP, TMP ) || SRC
      end                           /* but keep possible valid input */

      if ERR = 0  then do           /* at this point input is valid: */
         /* number of leading 1 bits in the lead byte minus 1 is    */
         /* the number of trail bytes; the rest is scalar payload   */
         LB = x2b( d2x( LB )) ;     LEN = verify( LB, 1 ) - 2
         LB = copies( 0, LEN ) || right( LB, 6 - LEN )
         do until TOP == ''         /* add 6 bits per trail byte    */
            parse var TOP TMP 2 TOP
            LB = LB || right( x2b( c2x( TMP )), 6 )
         end
         TOP = b2x( strip( LB, 'L', 0 ))     /* scalar value in hex. */
         if U.DEC then DST = DST || '&#' || x2d( TOP ) || ';'
                  else DST = DST || '&#x' || TOP || ';'
         U.CNV = U.CNV + 1
      end
      else do                       /* invalid: U+FFFD as ASCII NCR */
         U.BAD = U.BAD + 1
         if U.DEC then DST = DST || '&#65533;'
                  else DST = DST || '&#xFFFD;'
      end
   end

   if U.DEC then do                 /* hex. NCRs in text to decimal */
      SRC = DST || SRC ;   DST = ''
      TMP = pos( '&#x', SRC )
      do while sign( TMP )
         DST = DST || left( SRC, TMP + 1 )   /* copy up to '&#'     */
         SRC = substr( SRC, TMP + 3 )        /* rest after '&#x'    */
         TMP = pos( ';', SRC )      /* for 0 &#x without ; is BAD   */
         if TMP > 1  then do        /* for 1 &#x; is BAD hex. NCR   */
            TOP = left( SRC, TMP - 1 )
            if datatype( TOP, 'x' ) then do
               SRC = substr( SRC, TMP )      /* keep closing ';'    */
               DST = DST || x2d( TOP )
               TMP = pos( '&#x', SRC )
               U.CNV = U.CNV + 1 ;  iterate
            end                     /* converted another hex. NCR   */
         end
         TMP = pos( '&#x', SRC ) ;  DST = DST || 'x' /* restore 'x' */
         U.BAD = U.BAD + 1          /* BAD if &#x was no hex. NCR   */
      end                           /* (SGML syntax might differ)   */
   end
   return DST || SRC

/* ---------------------------------------------------------------- */
/* TEST(): self test of UNCR() using UTF8() as the encoder.         */
/* Checks US-ASCII, the base plane (surrogates must be rejected),   */
/* planes 15 + 16, and samples of values above 10FFFF (all must be  */
/* rejected).  Returns 0 if the expected number of bad encodings    */
/* was seen, else 1; exits via TRAP() on the first mismatch.        */

TEST: procedure expose U.           /* test some UTF-8 encodings:   */
   U.DEC = 1 ; ERR = '&#65533;'     /* test uses tricky dec. NCRs   */

   if UNCR( U.ASC ) <> U.ASC then exit TRAP( 'US ASCII garbled' )

   do N = 128 to 65535              /* skip D800..DFFF surrogates   */
      STR = '<' UTF8( N ) '>'
      if x2d( 'D800' ) <= N & N < x2d( 'E000' )
         then NCR = '<' ERR '>'
         else NCR = '< &#' || N || '; >'
      if UNCR( STR ) <> NCR then exit TRAP( 'base plane at' N )
   end N
   if U.BAD <> 2048 then exit TRAP( 'base plane:' U.BAD 'char.s' )
   say 'base plane PASS, testing planes 15 and 16...'
   if abbrev( 'KEDITW', address()) then 'refresh'

   do N = x2d( '0F0000' ) to x2d( '10FFFF' )
      STR = UTF8( N )
      NCR = '&#' || N || ';'
      if UNCR( STR ) <> NCR then exit TRAP( 'plane 15/16 at' N )
   end N
   if U.BAD <> 2048 then exit TRAP( 'plane 15/16:' U.BAD 'char.s' )
   say 'planes 0, 15, and 16 PASS, 1..14 not tested'
   if abbrev( 'KEDITW', address()) then 'refresh'

   /* U.CNV is reused as counter of all bad encodings checked; 10   */
   /* digits are needed for values up to FFFF FFFF                  */
   numeric digits 10 ;  U.CNV = U.BAD
   do N = x2d( '110000' ) to x2d( 'FFFF FFFF' ) by 76147
      STR = UNCR( UTF8( N )) ;   U.CNV = U.CNV + 1
      if abbrev( STR, ERR ) = 0 then exit TRAP( 'x'|| d2x( N ) N )
   end N

   if sign( 56389 * 76147 - x2d( 'FFFF FFFF' ) + x2d( '0011 0000' ))
      then say '56390 encodings above the STD 63 aka RFC 3629 limit'
      else say 'test suite PASS:' U.CNV '=' 2048+56390 'bad checked'
   return U.CNV <> 2048 + 56390

/* ---------------------------------------------------------------- */
/* UTF8( number ): encode a non-negative whole number in the UTF-8  */
/* bit pattern, without any range check, so that TEST() can also    */
/* create invalid encodings.  Used by TEST() for 128 and above.     */
/* Works on the reversed bit string, 6 payload bits per trail byte. */

UTF8: procedure                     /* decimal to UTF-8 for TEST:   */
   SRC = reverse( x2b( d2x( arg( 1 )))) ;  DST = ''
   do LEN = 2 until verify( substr( SRC, 8 - LEN ), 0 ) = 0
      DST = DST || left( SRC, 6, 0 ) || '01'
      SRC = substr( SRC, 7 )        /* encoded 6 bits of scalar     */
   end LEN                          /* remaining bits of scalar:    */
   DST = DST || left( SRC, 7 - LEN, 0 ) || 0
   return x2c( b2x( reverse( DST || copies( 1, LEN ))))

/* ---------------------------------------------------------------- */
/* UTIL( name ): register RexxUtil function name if not yet known.  */
/* Returns 0, or exits via TRAP() if the registration fails.        */
/* (c) F. Ellermann                                                 */

UTIL: procedure                     /* load necessary RexxUtil entry */
   if RxFuncQuery( arg( 1 )) then
      if RxFuncAdd( arg( 1 ), 'RexxUtil', arg( 1 )) then
         exit TRAP( "can't add RexxUtil" arg( 1 ))
   return 0

/* ---------------------------------------------------------------- */
/* TRAP: common condition handler, also called as TRAP( message ).  */
/* Builds a message in the style of the standard trace output with  */
/* condition, description, and source line, shows it, and sets rc:  */
/*   rc < 0   REXX condition (negated error code for SYNTAX/HALT,   */
/*            -3 FAILURE, -2 NOVALUE, -1 NOTREADY)                  */
/*   else     ERROR keeps the command rc, direct calls force a      */
/*            non-zero whole rc                                     */
/* Exits for SIGNAL ON conditions, returns rc otherwise.            */
/* TRAP is no procedure: it works on the variables of the caller.   */

TRAP:                               /* select REXX exception handler */
   call trace 'O' ;  trace N        /* don't trace interactive      */
   parse source TRAP                /* source on separate line      */
   TRAP = x2c( '0D' ) || right( '+++', 10 ) TRAP || x2c( '0D0A' )
   TRAP = TRAP || right( '+++', 10 )   /* = standard trace prefix   */
   TRAP = TRAP strip( condition( 'c' ) 'trap:' condition( 'd' ))

   select
      when wordpos( condition( 'c' ), 'ERROR FAILURE' ) > 0 then do
         if condition( 'd' ) > ''   /* need an additional line      */
            then TRAP = TRAP || x2c( '0D0A' ) || right( '+++', 10 )
         TRAP = TRAP '(RC' rc || ')'   /* any system error codes    */
         if condition( 'c' ) = 'FAILURE' then rc = -3
      end
      when wordpos( condition( 'c' ), 'HALT SYNTAX' ) > 0 then do
         if condition( 'c' ) = 'HALT' then rc = 4
         if condition( 'd' ) > '' & condition( 'd' ) <> rc then do
            if condition( 'd' ) <> errortext( rc ) then do
               TRAP = TRAP || x2c( '0D0A' ) || right( '+++', 10 )
               TRAP = TRAP errortext( rc )
            end                     /* future condition( 'd' )      */
         end                        /* may use errortext( rc )      */
         else  TRAP = TRAP errortext( rc )
         rc = -rc                   /* rc < 0: REXX error code      */
      end
      when condition( 'c' ) = 'NOVALUE'  then rc = -2 /* dubious    */
      when condition( 'c' ) = 'NOTREADY' then rc = -1 /* dubious    */
      otherwise                     /* force non-zero whole rc      */
         /* value( 'RC' ) avoids NOVALUE if rc was never set        */
         if datatype( value( 'RC' ), 'W' ) = 0 then rc = 1
         if rc = 0                             then rc = 1
         if condition() = '' then TRAP = TRAP arg( 1 )
   end                              /* direct: TRAP( message )      */

   TRAP = TRAP || x2c( '0D0A' ) || format( sigl, 6 )
   signal on syntax name TRAP.SIGL  /* throw syntax error 3...      */
   if 0 < sigl & sigl <= sourceline()  /* if no handle for source   */
      then TRAP = TRAP '*-*' strip( sourceline( sigl ))
      else TRAP = TRAP '+++ (source line unavailable)'

TRAP.SIGL:                          /* ...catch syntax error 3      */
   /* message still ends with CR LF + line number: no source added  */
   if abbrev( right( TRAP, 2 + 6 ), x2c( '0D0A' )) then do
      TRAP = TRAP '+++ (source line unreadable)'  ;  rc = -rc
   end

   select                           /* first two ways are disabled  */
      when 0 then do                /* in pipes STDERR: output      */
         parse version TRAP.REXX    /* REXX/Personal: \dev\con      */
         if abbrev( TRAP.REXX, 'REXXSAA ' ) | /**/ ,
            6 <= word( TRAP.REXX, 2 )
            then TRAP.REXX = 'STDERR'
            else TRAP.REXX = '\dev\con'
         signal on syntax name TRAP.FAIL
         call lineout TRAP.REXX , TRAP  /* fails if no more handle  */
      end
      when 0 then do                /* OS/2 PM or ooREXX on NT      */
         signal on syntax name TRAP.FAIL
         call RxMessageBox translate( TRAP, ' ', x2c( '0D' )), /**/ ,
            'Trap' time(),, 'ERROR'
      end
      otherwise
         say TRAP ;  trace ?L       /* interactive Label trace      */
   end

   if condition() = 'SIGNAL' then signal TRAP.EXIT
TRAP.CALL:  return rc               /* continue after CALL ON       */
TRAP.FAIL:  say TRAP ;  rc = 0 - rc /* force TRAP error output      */
TRAP.EXIT:  exit rc                 /* exit for any SIGNAL ON       */