/**/ signal on novalue /* force REXX and its way of 'NOVALUE ON' */ /* Usage: KEDIT file (PROFile X-WIKI */ /* MACRO X-WIKI */ /* Purpose: Translate action=raw UTF-8 in MIME text/x-wiki */ /* to XHTML NCRs (numerical character references). */ /* Compatibility: Old Netscape browsers don't support hex. NCRs, */ /* here the default DEC = 1 enforces decimal NCRs. */ /* Test suite: Edit "if 0 then exit TEST()" below and then use */ /* MACRO X-WIKI to start the test suite. Old code */ /* to catch 34 of 66 "non-characters" like u+FFFE */ /* was removed in this version. */ /* Feature: UNCR() can be used as is in other REXX scripts, */ /* the test suite TEST() and UTF8() also work as */ /* is. The required global variables are U.ASC, */ /* U.B64, U.CNV, U.BAD, and U.DEC as shown below. */ /* Optional: INITIAL.KEX if used as profile as in X-WIKI.CMD */ /* Requires: Kedit 5.0 (Frank Ellermann, 2006) */ U.CNV = 0 ; U.ASC = xrange( x2c( '00' ), x2c( '7F' )) U.BAD = 0 ; U.B64 = xrange( x2c( '80' ), x2c( 'BF' )) U.DEC = 1 /* 0: hex. NCRs, 1: dec. NCRs */ if 0 then exit TEST() /* 1: TEST planes 0, 15, 16 */ if profile() then call INIT /* settings for KEDIT profile */ do N = 1 to size.1() /* 'refresh' in INIT got size */ ':' N ; X = curline.3() Y = UNCR( X ) ; if X <> Y then 'r' Y U.BAD = U.BAD + ( rc <> 0 ) /* FATAL, insufficient width */ end N say U.BAD 'errors,' U.CNV 'UTF-8 characters converted to NCRs' ':0 forw half' ; exit U.BAD > 0 /* rc 0: no conversion errors */ INIT: procedure /* for macro used as profile: */ 'editv get INITIAL.0' /* read variable INITIAL.0 */ if INITIAL.0 = '' then do /* if INITIAL.0 undefined: */ 'macro INITIAL' /* use INITIAL.KEX to set */ 'editv get INITIAL.0' /* new INITIAL.0 count of */ end /* unusual local defaults */ do N = 1 to INITIAL.0 /* including initial ATTRs */ 'editv get INITIAL.' || N 'set' INITIAL.N ; if rc <> 0 then exit rc end N 'eofin allow' ; 'eolin lf' ; 'tabsin off' 'eofout eol' ; 'eolout lf' ; 'tabsout off' 'trailing on' ; 'refresh' ; return UNCR: procedure expose U. /* convert UTF-8 text to NCRs */ parse arg SRC ; DST = '' do while SRC <> '' /* skip the rest if US-ASCII: */ LEN = verify( SRC, U.ASC ) -1 ; if LEN < 0 then leave DST = DST || left( SRC, LEN ) ; SRC = substr( SRC, LEN + 1 ) parse var SRC LB 2 SRC ; LB = c2d( LB ) TOP = 0 if SRC \== '' then TOP = c2d( left( SRC, 1 )) % 16 select /* for CESU remove both LB = 237 */ when LB < 192 then LEN = -0 /* trail bytes */ when LB < 194 then LEN = -1 /* bad C0 + C1 */ when LB < 224 then LEN = +1 when LB = 224 & TOP = 8 then LEN = -2 /* E08x is bad */ when LB = 224 & TOP = 9 then LEN = -2 /* E09x is bad */ when LB = 237 & TOP = 10 then LEN = -2 /* EDAx is bad */ when LB = 237 & TOP = 11 then LEN = -2 /* EDBx is bad */ when LB < 240 then LEN = +2 when LB = 240 & TOP = 8 then LEN = -3 /* F08x is bad */ when LB < 244 then LEN = +3 when LB = 244 & TOP = 8 then LEN = +3 /* F48x is ok. */ when LB < 248 then LEN = -3 /* bad F4 - F7 */ when LB < 252 then LEN = -4 /* bad F8 - FB */ when LB < 254 then LEN = -5 /* bad FC + FD */ otherwise LEN = -0 /* bad FE + FF */ end ERR = ( LEN <= 0 ) ; LEN = abs( LEN ) if length( SRC ) < LEN then do ERR = 1 ; LEN = length( SRC ) end TOP = left( SRC, LEN ) ; SRC = substr( SRC, LEN + 1 ) TMP = verify( TOP, U.B64 ) if TMP > 0 then do /* eat plausible trailing bytes: */ ERR = 1 ; SRC = substr( TOP, TMP ) || SRC end /* but keep possible valid input */ if ERR = 0 then do /* at this point input is valid: */ LB = x2b( d2x( LB )) ; LEN = verify( LB, 1 ) - 2 LB = copies( 0, LEN ) || right( LB, 6 - LEN ) do until TOP == '' parse var TOP TMP 2 TOP LB = LB || right( x2b( c2x( TMP )), 6 ) end TOP = b2x( strip( LB, 'L', 0 )) if U.DEC then DST = DST || '&#' || x2d( TOP ) || ';' else DST = DST || '&#x' || TOP || ';' U.CNV = U.CNV + 1 end else do U.BAD = U.BAD + 1 if U.DEC then DST = DST || '�' else DST = DST || '�' end end if U.DEC then do SRC = DST || SRC ; DST = '' TMP = pos( '&#x', SRC ) do while sign( TMP ) DST = DST || left( SRC, TMP + 2 ) SRC = substr( SRC, TMP + 4 ) TMP = pos( ';', SRC ) /* for 0 &#x without ; is BAD */ if TMP > 1 then do /* for 1 &#x; is BAD hex. NCR */ TOP = left( SRC, TMP - 1 ) if datatype( TOP, 'x' ) then do SRC = substr( SRC, TMP + 1 ) DST = DST || x2d( TOP ) TMP = pos( '&#x', SRC ) U.CNV = U.CNV + 1 ; iterate end /* converted another hex. NCR */ end TMP = pos( '&#x', SRC ) ; DST = DST || 'x' U.BAD = U.BAD + 1 /* BAD if &#x was no hex. NCR */ end /* (SGML syntax might differ) */ end return DST || SRC TEST: procedure expose U. /* test some UTF-8 encodings: */ U.DEC = 1 ; ERR = '�' /* test uses tricky dec. NCRs */ if UNCR( U.ASC ) <> U.ASC then exit TRAP( 'US ASCII garbled' ) do N = 128 to 65535 /* skip D800..DFFF surrogates */ STR = '<' UTF8( N ) '>' if x2d( 'D800' ) <= N & N < x2d( 'E000' ) then NCR = '<' ERR '>' else NCR = '< &#' || N || '; >' if UNCR( STR ) <> NCR then exit TRAP( 'base plane at' N ) end N if U.BAD <> 2048 then exit TRAP( 'base plane:' U.BAD 'char.s' ) say 'base plane PASS, testing planes 15 and 16...' if abbrev( 'KEDITW', address()) then 'refresh' do N = x2d( 0F0000 ) to x2d( 10FFFF ) STR = UTF8( N ) NCR = '&#' || N || ';' if UNCR( STR ) <> NCR then exit TRAP( 'plane 15/16 at' N ) end N if U.BAD <> 2048 then exit TRAP( 'plane 15/16:' U.BAD 'char.s' ) say 'planes 0, 15, and 16 PASS, 1..14 not tested' if abbrev( 'KEDITW', address()) then 'refresh' numeric digits 10 ; U.CNV = U.BAD do N = x2d( 110000 ) to x2d( 'FFFF FFFF' ) by 76147 STR = UNCR( UTF8( N )) ; U.CNV = U.CNV + 1 if abbrev( STR, ERR ) = 0 then exit TRAP( 'x'|| d2x( N ) N ) end N if sign( 56389 * 76147 - x2d( 'FFFF FFFF' ) + x2d( '0011 0000' )) then say '56390 encodings above the STD 63 aka RFC 3629 limit' else say 'test suite PASS:' U.CNV '=' 2048+56390 'bad checked' return U.CNV <> 2048 + 56390 UTF8: procedure /* decimal to UTF-8 for TEST: */ SRC = reverse( x2b( d2x( arg( 1 )))) ; DST = '' do LEN = 2 until verify( substr( SRC, 8 - LEN ), 0 ) = 0 DST = DST || left( SRC, 6, 0 ) || '01' SRC = substr( SRC, 7 ) /* encoded 6 bits of scalar */ end LEN /* remaining bits of scalar: */ DST = DST || left( SRC, 7 - LEN, 0 ) || 0 return x2c( b2x( reverse( DST || copies( 1, LEN )))) TRAP: 'emsg assertion failed:' arg( 1 ) ; exit 1 /* my error */