-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbocu.rex
812 lines (739 loc) · 35.3 KB
/
bocu.rex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
/* OS/2 REXX or NT ooREXX: check some UCS transformation formats */
/* Magic numbers : */
/* 56613888 : CP = 1024 * c2d( HI ) + c2d( LO ) - 56613888 */
/* 1114111 : u+10FFFF, last code point in plane 16 */
/* 233006 : u+038E2E, first UTF-1 using five octets */
/* 187660 : BOCU-1 difference with three trailing octets */
/* 65533 : u+FFFD, replacement char. for input error */
/* 55296 : u+D800, 1st UTF-16 high (leading) surrogate */
/* 56320 : u+DC00, 1st UTF-16 low (trailing) surrogate */
/* 16406 : u+4016, 1st UTF-1 using three octets */
/* 10513 : BOCU-1 difference with two trailing octets */
/* 1024 : 2 ** 10 high/low surrogates (2 ** 20 pairs) */
/* 243 : base for trailing BOCU-1 octets (0..3) */
/* 190 : base for trailing UTF-1 octets (0..2, or 4) */
/* 64 : base for trailing UTF-8 octets (0..3) */
/* 16 : base for trailing UTF-4 octets (0..6) */
/* UTF procedures : */
/* UTF.8I, UTF.8O : UTF-8 string to UTF-16BE, or vice versa */
/* UTF.7I, UTF.7O : UTF-7 string to UTF-16BE, or vice versa */
/* UTF.4I, UTF.4O : UTF-4 string to UTF-16BE, or vice versa */
/* UTF.1I, UTF.1O : UTF-1 string to UTF-16BE, or vice versa */
/* BOCU.I, BOCU.O : BOCU-1 string to UTF-16BE, or vice versa */
/* BOCU.5 : BOCU-1 helper (specified rules 5a, 5b, 5c) */
/* UTF.32 : Internal UTF-32BE to UTF-16BE conversion */
/* Test procedures: */
/* UCS.4 : Check UCS-4 code points (17 * 65536 - 2048) */
/* Legacy : Check code pages 819, 858, 923, 1252, etc. */
/* STIR : Used to determine average BOCU-1 compression */
/* CHECK : Check S = decode( encode( S )), show results */
signal on novalue
TEST. = 0 ; TEST.0.1 = 'UTF-16 '
TEST.0.2 = 'UTF-8 ' ; TEST.0.3 = 'UTF-7 '
TEST.0.4 = 'UTF-4 ' ; TEST.0.5 = 'UTF-1 '
TEST.0.6 = 'BOCU-1 ' ; TEST.0.7 = 'reserved'
if 1 then call UCS.4 /* code points, excl. surrogates */
if 0 then call LEGACY /* codepage 437, 858, 1252, etc. */
if symbol( 'RESULT' ) = 'LIT' then say 'edit "if 0" for a test'
exit 0
UCS.4: procedure expose TEST. /* test 2**21 -2048 code points, */
N = 0 /* CAVEAT, this takes some time: */
if arg( 1, 'e' ) then do ; N = arg( 1 ) ; trace ?R ; end
do N = N to x2d( '10FFFF' )
if x2d( 'D800' ) <= N & N < x2d( 'E000' ) then iterate N
if N <= x2d( 'FFFF' )
then SRC = x2c( d2x( N, 4 ))
else SRC = UTF.32( x2c( d2x( N, 8 )))
TEST.1.1 = TEST.1.1 + length( SRC )
DST = UTF.8O( SRC ) ; if SRC <> UTF.8I( DST ) then leave N
TEST.1.2 = TEST.1.2 + length( DST )
DST = UTF.7O( SRC ) ; if SRC <> UTF.7I( DST ) then leave N
TEST.1.3 = TEST.1.3 + length( DST )
DST = UTF.4O( SRC ) ; if SRC <> UTF.4I( DST ) then leave N
TEST.1.4 = TEST.1.4 + length( DST )
DST = UTF.1O( SRC ) ; if SRC <> UTF.1I( DST ) then leave N
TEST.1.5 = TEST.1.5 + length( DST )
DST = BOCU.O( SRC ) ; if SRC <> BOCU.I( DST ) then leave N
TEST.1.6 = TEST.1.6 + length( DST )
call charout 'stderr', d2c( 13 ) || N || d2c( 13 )
end N
if N = x2d( '110000' ) then do N = 1 to 6
say TEST.0.N right( TEST.1.N, 10 )
end
else say ' FAIL u+' || right( d2x( N ), 6, 0 )
return 0
LEGACY: procedure expose TEST. /* test various legacy codepages */
S.0 = 0020 ; E.0 = 007E ; S.1 = 00A0 ; E.1 = 0113
S.2 = 0116 ; E.2 = 012B ; S.3 = 012E ; E.3 = 014D
S.4 = 0150 ; E.4 = 017E ; S.5 = 02C7 ; E.5 = 02C7
S.6 = 02D8 ; E.6 = 02DB ; S.7 = 02DD ; E.7 = 02DD
S.8 = 2015 ; E.8 = 2015 ; S.9 = 2018 ; E.9 = 2019
S.10 = 201C ; E.10 = 201D ; S.11 = 20AC ; E.11 = 20AC
S.12 = 2122 ; E.12 = 2122 ; S.13 = 2126 ; E.13 = 2126
S.14 = 215B ; E.14 = 215E ; S.15 = 2190 ; E.15 = 2193
S.16 = 266A ; E.16 = 266A ; MES = ''
SUM. = 0
do N = 0 to 16
do U = x2d( S.N ) to x2d( E.N )
MES = MES || x2c( d2x( U, 4 ))
SUM.0 = SUM.0 + 1
end U
end N
BOM = 'FEFF'x ; TEST.2.0 = 'MES-1'
SRC = BOM || MES ; TEST.2.1 = length( SRC )
SUM.1 = TEST.2.1 ; TEST.1.1 = left( SRC, 2 )
if CHECK( 2, 1, SRC, /* == */ SRC ) then exit 1
DST = UTF.8O( SRC ) ; TEST.2.2 = length( DST )
SUM.2 = TEST.2.2 ; TEST.1.2 = left( DST, 3 )
if CHECK( 2, 2, SRC, UTF.8I( DST )) then exit 1
DST = UTF.7O( SRC ) ; TEST.2.3 = length( DST )
SUM.3 = TEST.2.3 ; TEST.1.3 = left( DST, 4 )
if CHECK( 2, 3, SRC, UTF.7I( DST )) then exit 1
DST = UTF.4O( SRC ) ; TEST.2.4 = length( DST )
SUM.4 = TEST.2.4 ; TEST.1.4 = left( DST, 5 )
if CHECK( 2, 4, SRC, UTF.4I( DST )) then exit 1
DST = UTF.1O( SRC ) ; TEST.2.5 = length( DST )
SUM.5 = TEST.2.5 ; TEST.1.5 = left( DST, 3 )
if CHECK( 2, 5, SRC, UTF.1I( DST )) then exit 1
AVG = 0
do N = 1 to SUM.0 /* ----------------------------- */
call charout 'stderr', 'BOCU-1' N || d2c( 13 )
SRC = BOM || STIR( MES, N )
DST = BOCU.O( SRC )
if SRC == BOCU.I( DST )
then AVG = AVG + length( DST )
else exit CHECK( 2, 6, SRC, BOCU.I( DST ))
end N
TEST.2.6 = format( AVG / N,, 0 )
TEST.1.6 = left( DST, 3 ) ; SUM.6 = TEST.2.6
call CHECK 2, 6 /* ----------------------------- */
TEST = 'n/a n/a 437 819 858 923 1252 1257 10000'
do CASE = 3 to words( TEST )
PAGE = word( TEST, CASE ) ; TEST.CASE.0 = PAGE
SRC = '' ; T = ''
say ; call CHECK CASE, 0
do N = 0 to 127 ; SRC = SRC || x2c( d2x( N, 4 )) ; end N
select
when PAGE = 819 | PAGE = 28591 then do /* ISO Latin 1 */
do N = 128 to 255 ; T = T d2x( N ) ; end N /* 80-FF */
end /* -------------------------- */
when PAGE = 923 | PAGE = 28605 then do /* ISO Latin 9 */
do N = 128 to 159 ; T = T d2x( N ) ; end N /* 80-9F */
T = T ' A0 A1 A2 A3 20AC A5 160 A7' /* A0 */
T = T ' 161 A9 AA AB AC AD AE AF' /* A8 */
T = T ' B0 B1 B2 B3 17D B5 B6 B7' /* B0 */
T = T ' 17E B9 BA BB 152 153 178 BF' /* B8 */
do N = 192 to 255 ; T = T d2x( N ) ; end N /* C0-FF */
end /* -------------------------- */
when PAGE = 1252 then do /* windows variant of Latin-1 */
T = T '20AC 81 201A 192 201E 2026 2020 2021' /* 80 */
T = T ' 2C6 2030 160 2039 152 86 17D 88' /* 88 */
T = T ' 90 2018 2019 201C 201D 2022 2013 2014' /* 90 */
T = T ' 2DC 2122 161 203A 153 96 17E 17F' /* 98 */
do N = 160 to 255 ; T = T d2x( N ) ; end N /* A0-FF */
end /* -------------------------- */
when PAGE = 437 then do /* US OEM DOS */
T = T ' C7 FC E9 E2 E4 E0 E5 E7' /* 80 */
T = T ' EA EB E8 EF EE EC C4 C5' /* 88 */
T = T ' C9 E6 C6 F4 F6 F2 FB F9' /* 90 */
T = T ' FF D6 DC A2 A3 A5 20A7 192' /* 98 */
T = T ' E1 ED F3 FA F1 D1 AA BA' /* A0 */
T = T ' BF 2310 AC BD BC A1 AB BB' /* A8 */
T = T '2591 2592 2593 2502 2524 2561 2562 2556' /* B0 */
T = T '2555 2563 2551 2557 255D 255C 255B 2510' /* B8 */
T = T '2514 2534 252C 251C 2500 253C 255E 255F' /* C0 */
T = T '255A 2554 2569 2566 2560 2550 256C 2567' /* C8 */
T = T '2568 2564 2565 2559 2558 2552 2553 256B' /* D0 */
T = T '256A 2518 250C 2588 2584 258C 2590 2580' /* D8 */
T = T ' 3B1 DF 393 3C0 3A3 3C3 B5 3C4' /* E0 */
T = T ' 3A6 398 3A9 3B4 221E 3C6 3B5 2229' /* E8 */
T = T '2261 B1 2265 2264 2320 2321 F7 2248' /* F0 */
T = T ' B0 2219 B7 221A 207F B2 25A0 A0' /* F8 */
end /* -------------------------- */
when PAGE = 858 then do /* PC-multilingual-850+euro */
T = T ' C7 FC E9 E2 E4 E0 E5 E7' /* 80 */
T = T ' EA EB E8 EF EE EC C4 C5' /* 88 */
T = T ' C9 E6 C6 F4 F6 F2 FB F9' /* 90 */
T = T ' FF D6 DC F8 A3 D8 D7 192' /* 98 */
T = T ' E1 ED F3 FA F1 D1 AA BA' /* A0 */
T = T ' BF AE AC BD BC A1 AB BB' /* A8 */
T = T '2591 2592 2593 2502 2524 C1 C2 C0' /* B0 */
T = T ' A9 2563 2551 2557 255D A2 A5 2510' /* B8 */
T = T '2514 2534 252C 251C 2500 253C E3 C3' /* C0 */
T = T '255A 2554 2569 2566 2560 2550 256C A4' /* C8 */
T = T ' F0 D0 CA CB C8 20AC CD CE' /* D0 */
T = T ' CF 2518 250C 2588 2584 A6 CC 2580' /* D8 */
T = T ' D3 DF D4 D2 F5 D5 B5 FE' /* E0 */
T = T ' DE DA DB D9 FD DD AF B4' /* E8 */
T = T ' AD B1 2017 BE B6 A7 F7 B8' /* F0 */
T = T ' B0 A8 B7 B9 B3 B2 25A0 A0' /* F8 */
/* 0xD5 850: u+0131 small dotless i, 858: u+20AC Euro */
end /* -------------------------- */
when PAGE = 1257 then do /* windows variant of Latin-4 */
T = T '20AC 81 201A 83 201E 2026 2020 2021' /* 80 */
T = T ' 88 2030 8A 2039 8C A8 2C7 B8' /* 88 */
T = T ' 90 2018 2019 201C 201D 2022 2013 2014' /* 90 */
T = T ' 98 2122 9A 203A 9C AF 2DB 9F' /* 98 */
T = T ' A0 A1 A2 A3 A4 A5 A6 A7' /* A0 */
T = T ' D8 A9 156 AB AC AD AE C6' /* A8 */
T = T ' B0 B1 B2 B3 B4 B5 B6 B7' /* B0 */
T = T ' F8 B9 157 BB BC BD BE E6' /* B8 */
T = T ' 104 12E 100 106 C4 C5 118 112' /* C0 */
T = T ' 10C C9 179 116 122 136 12A 13B' /* C8 */
T = T ' 160 143 145 D3 14C D5 D6 D7' /* D0 */
T = T ' 172 141 15A 16A DC 17B 17D DF' /* D8 */
T = T ' 105 12F 101 107 E4 E5 119 113' /* E0 */
T = T ' 10D E9 17A 117 123 137 12B 13C' /* E8 */
T = T ' 161 144 146 F3 14D F5 F6 F7' /* F0 */
T = T ' 173 142 15B 16B FC 17C 17E 2D9' /* F8 */
end /* -------------------------- */
when PAGE = 10000 then do /* Mac Roman (Euro version) */
T = T ' C4 C5 C7 C9 D1 D6 DC E1' /* 80 */
T = T ' E0 E2 E4 E3 E5 E7 E9 E8' /* 88 */
T = T ' EA EB ED EC EE EF F1 F3' /* 90 */
T = T ' F2 F4 F6 F5 FA F9 FB FC' /* 98 */
T = T '2020 B0 A2 A3 A7 2022 B6 DF' /* A0 */
T = T ' AE A9 2122 B4 A8 2260 C6 D8' /* A8 */
T = T '221E B1 2264 2265 A5 B5 2202 2211' /* B0 */
T = T '220F 3C0 222B AA BA 3A9 E6 F8' /* B8 */
T = T ' BF A1 AC 221A 192 2248 2206 AB' /* C0 */
T = T ' BB 2026 A0 C0 C3 D5 152 153' /* C8 */
T = T '2013 2014 201C 201D 2018 2019 F7 25CA' /* D0 */
T = T ' FF 178 2044 20AC 2039 203A FB01 FB02' /* D8 */
T = T '2021 B7 201A 201E 2030 C2 CA C1' /* E0 */
T = T ' CB C8 CD CE CF CC D3 D4' /* E8 */
T = T 'F8FF D2 DA DB D9 131 2C6 2DC' /* F0 */
T = T ' AF 2D8 2D9 2DA B8 2DD 2DB 2C7' /* F8 */
/* 0xBD old u+2126 Ohm : new u+03A9 Omega */
/* 0xDB old u+00A4 currency symbol : new u+20AC Euro */
/* 0xF0 old u+2665 black heart suit: new u+F8FF priv. */
end /* -------------------------- */
end /* otherwise raise REXX error 40 */
LEN = words( T ) ; AVG = 0
do N = 1 to LEN
SRC = SRC || x2c( right( word( T, N ), 4, 0 ))
end N
SUM.0 = SUM.0 + LEN ; TEST.CASE.1 = length( SRC )
if CHECK( CASE, 1, SRC, /* == */ SRC ) then exit 1
SUM.1 = SUM.1 + TEST.CASE.1
DST = UTF.8O( SRC ) ; TEST.CASE.2 = length( DST )
if CHECK( CASE, 2, SRC, UTF.8I( DST )) then exit 1
SUM.2 = SUM.2 + TEST.CASE.2
DST = UTF.7O( SRC ) ; TEST.CASE.3 = length( DST )
if CHECK( CASE, 3, SRC, UTF.7I( DST )) then exit 1
SUM.3 = SUM.3 + TEST.CASE.3
DST = UTF.4O( SRC ) ; TEST.CASE.4 = length( DST )
if CHECK( CASE, 4, SRC, UTF.4I( DST )) then exit 1
SUM.4 = SUM.4 + TEST.CASE.4
DST = UTF.1O( SRC ) ; TEST.CASE.5 = length( DST )
if CHECK( CASE, 5, SRC, UTF.1I( DST )) then exit 1
SUM.5 = SUM.5 + TEST.CASE.5
do N = 1 to LEN /* ----------------------------- */
call charout 'stderr', 'BOCU-1' N || d2c( 13 )
MIX = STIR( SRC, N )
DST = BOCU.O( MIX )
if MIX == BOCU.I( DST )
then AVG = AVG + length( DST )
else exit CHECK( CASE, 6, MIX, BOCU.I( DST ))
end N
TEST.CASE.6 = format( AVG / LEN,, 0 )
call CHECK CASE, 6 ; SUM.6 = SUM.6 + TEST.CASE.6
end CASE /* ----------------------------- */
L = words( TEST ) - 2 ; say
do N = 0 to 6
if N = 0 then TEST...N = 'average'
else TEST...N = format( SUM.N / L,, 0 )
call CHECK ., N
end
return 0
STIR: procedure /* stir SRC sequence for BOCU-1: */
parse arg SRC, GAP
POS = 2 * ( length( SRC ) / 2 - GAP )
DST = '' /* move n-th character from SRC */
do N = 1 while SRC <> '' /* to DST for BOCU-1 comparison */
POS = ( POS + 2 * GAP ) // length( SRC )
DST = DST || substr( SRC, POS + 1, 2 )
SRC = left( SRC, POS ) || substr( SRC, POS + 3 )
end N
return DST
CHECK: procedure expose TEST. /* progress (or error) indicator */
parse arg CASE, N, WANT, GOT
if N > 0 then LINE = TEST.0.N left( c2x( TEST.1.N ), 10 )
else LINE = right( 'codepage:', 19 )
if GOT == WANT then do
do L = 2 to 9
if TEST.L.N <> 0 then LINE = LINE right( TEST.L.N, 5 )
end L
if TEST...N <> 0 then LINE = LINE right( TEST...N, 9 )
say LINE ; return 0
end
else do
LINE = LINE 'error in case' CASE
do L = 1 to length( WANT ) by 4
if substr( WANT, L, 4 ) \== substr( GOT, L, 4 ) then do
LINE = LINE 'wanted' c2x( substr( WANT, L, 4 ))
LINE = LINE 'but got' c2x( substr( GOT, L, 4 ))
leave L
end
end L
say LINE ; return 1
end
UTF.8I: procedure /* UTF-8 via UTF-4 to UTF-16BE */
parse arg SRC ; DST = ''
do while sign( length( SRC ))
POS = verify( SRC, xrange( '00'x, '7F'x )) - 1
if POS < 0 then leave ; DST = DST || left( SRC, POS )
SRC = substr( SRC, POS + 1 )
parse var SRC TOP 2 SRC ; LOS = length( SRC )
TMP = left( c2x( left( SRC, 1 )), 1 )
if TOP < 'C0'x | LOS = 0 then LEN = -0
else if TOP < 'C2'x then LEN = -1
else if TOP < 'E0'x then LEN = +1
else if TOP = 'E0'x & TMP = '8' then LEN = -2
else if TOP = 'E0'x & TMP = '9' then LEN = -2
else if TOP = 'ED'x & TMP = 'A' then LEN = -2
else if TOP = 'ED'x & TMP = 'B' then LEN = -2
else if TOP < 'F0'x then LEN = +2
else if TOP = 'F0'x & TMP = '8' then LEN = -3
else if TOP < 'F4'x then LEN = +3
else if TOP = 'F4'x & TMP = '8' then LEN = +3
else if TOP < 'F8'x then LEN = -3
else if TOP < 'FC'x then LEN = -4
else if TOP < 'FE'x then LEN = -5
else LEN = -0
BAD = ( LEN <= 0 ) ; LEN = abs( LEN )
if LOS < LEN then do
BAD = 1 ; LEN = LOS
end
CHR = left( SRC, LEN ) ; SRC = substr( SRC, LEN + 1 )
TMP = verify( CHR, xrange( '80'x, 'BF'x ))
if TMP > 0 then do
BAD = 1 ; SRC = substr( CHR, TMP ) || SRC
end
if BAD = 0 then do /* convert valid UTF-8 to bits */
TOP = x2b( c2x( TOP )) ; LEN = verify( TOP, 1 ) - 2
TOP = copies( 0, LEN ) || right( TOP, 6 - LEN )
do L = 1 to LEN /* determine 12, 18, or 24 bits: */
parse var CHR TMP 2 CHR
TOP = TOP || right( x2b( c2x( TMP )), 6 )
end
if LEN = 2 then TOP = 00 || TOP
if abbrev( TOP, 0000 ) then TOP = substr( TOP, 5 )
if abbrev( TOP, 0000 ) then TOP = substr( TOP, 5 )
LEN = length( TOP ) % 4
if LEN > 2 | abbrev( TOP, 100 ) then do
DST = DST || x2c( 8 || LEN )
do L = 1 to LEN /* use pieces of four bits */
parse var TOP TMP 5 TOP
DST = DST || x2c( 9 || b2x( TMP ))
end
end
else DST = DST || x2c( b2x( TOP ))
end
else DST = DST || '849F9F9F9D'x
end
return UTF.4I( DST || SRC )
UTF.8O: procedure /* UTF-16BE to UTF-8 encoder */
parse arg SRC ; DST = ''
LOS = length( SRC ) ; if LOS // 2 then signal UTF.ERR
do while LOS > 0
parse var SRC CHR 3 SRC ; LOS = LOS - 2
select
when CHR << '0080'x then do
DST = DST || right( CHR, 1 )
iterate
end
when CHR << 'D800'x then CHR = c2d( CHR )
when CHR >> 'DFFF'x then CHR = c2d( CHR )
when CHR >> 'DBFF'x then CHR = 65533
when LOS = 0 then CHR = 65533
otherwise
parse var SRC TMP 3 SRC ; LOS = LOS - 2
if TMP << 'DC00'x | 'DFFF'x << TMP then do
SRC = TMP || SRC ; CHR = 65533
LOS = LOS + 2 /* undo wrong trailing surrogate */
end
else CHR = 1024 * c2d( CHR ) + c2d( TMP ) - 56613888
end
BIN = reverse( x2b( d2x( CHR )))
CHR = ''
do LEN = 2 until verify( substr( BIN, 8 - LEN ), 0 ) = 0
CHR = CHR || left( BIN, 6, 0 ) || 01
BIN = substr( BIN, 7 )
end LEN
BIN = CHR || left( BIN, 8 - LEN, 0 ) || copies( 1, LEN )
DST = DST || x2c( b2x( reverse( BIN )))
end
return DST
UTF.7I: procedure /* UTF-7 via UTF-4 to UTF-16BE */
parse arg SRC ; DST = ''
B64 = 'abcdefghijklmnopqrstuvwxyz'
B64 = translate( B64 ) || B64 || '0123456789+/'
do while sign( length( SRC ))
POS = pos( '+', SRC ) - 1 ; if POS < 0 then leave
DST = DST || left( SRC, POS )
SRC = substr( SRC, POS + 2 )
if abbrev( SRC, '-' ) then do
DST = DST || '+' ; SRC = substr( SRC, 2 )
iterate /* decode '+-' as '+' = u+002B */
end
LEN = verify( SRC || '-', B64 ) - 1
if LEN = 0 then do /* '+' before non-B64 is invalid */
DST = DST || '849F9F9F9D'x
iterate
end
TOP = left( SRC, LEN ) ; SRC = substr( SRC, LEN + 1 )
if abbrev( SRC, '-' ) then SRC = substr( SRC, 2 )
U16 = '' ; POS = ( LEN * 6 ) // 8
do N = 1 to LEN /* decode B64 chars.s after '+' */
HEX = d2x( pos( substr( TOP, N, 1 ), B64 ) - 1 )
U16 = U16 || right( x2b( HEX ), 6, 0 )
end N
U16 = x2c( b2x( left( U16, LEN * 6 - POS )))
LEN = length( U16 )
if LEN // 2 | POS = 6 then do
if LEN // 2 then U16 = left( U16, LEN - 1 )
U16 = U16 || 'FFFD'x /* odd length or invalid padding */
end
DST = DST || UTF.4O( U16 )
end
return UTF.4I( DST || SRC )
UTF.7O: procedure /* UTF-16BE to UTF-7 encoder */
parse arg SRC ; DST = '' ; U16 = ''
LOS = length( SRC ) ; if LOS // 2 then signal UTF.ERR
B64 = 'abcdefghijklmnopqrstuvwxyz'
B64 = translate( B64 ) || B64 || '0123456789+/'
do while LOS > 0
parse var SRC CHR 3 SRC ; LOS = LOS - 2
select /* special cases '\', '~', etc. */
when abbrev( CHR, '09'x ) then nop
when abbrev( CHR, '20'x ) then nop
when CHR == '0009'x then CHR = '09'x
when CHR == '000A'x then CHR = '0A'x
when CHR == '000D'x then CHR = '0D'x
when CHR << '0020'x then nop
when CHR == '005C'x then nop
when CHR << '007E'x then CHR = right( CHR, 1 )
when CHR << 'D800'x then nop
when CHR >> 'DFFF'x then nop
when CHR >> 'DBFF'x then CHR = 'FFFD'x
when LOS = 0 then CHR = 'FFFD'x
otherwise
parse var SRC TMP 3 SRC ; LOS = LOS - 2
if TMP << 'DC00'x | 'DFFF'x << TMP then do
SRC = TMP || SRC ; CHR = 'FFFD'x
LOS = LOS + 2 /* undo wrong trailing surrogate */
end
else do
U16 = U16 || CHR ; CHR = TMP
end
end
if length( CHR ) = 2 then do
U16 = U16 || CHR ; if SRC \== '' then iterate
CHR = '' /* abbrev( '-', CHR ) = 1 at end */
end /* collect U16 until UTF-7 ASCII */
if U16 \== '' then do /* output U16 before UTF-7 ASCII */
DST = DST || '+' /* '+' (excl. '+-') starts a B64 */
U16 = x2b( c2x( U16 )) /* '-' or non-B64 terminates B64 */
U16 = U16 || copies( '00', ( length( U16 ) % 4 ) // 3 )
do while U16 <> ''
parse var U16 TMP 7 U16
DST = DST || substr( B64, x2d( b2x( TMP )) + 1, 1 )
end /* add '-' also if next is a '-' */
if sign( pos( CHR, B64 )) | abbrev( '-', CHR )
then DST = DST || '-'
end
if CHR = '+' then DST = DST || '+-'
else DST = DST || CHR
end
return DST
UTF.4I: procedure /* UTF-4 to UTF-16BE decoder */
parse arg SRC ; DST = ''
do while sign( length( SRC ))
parse var SRC TOP 2 SRC
if TOP < '80'x | 'A0'x <= TOP then do
DST = DST || right( TOP, 2, '00'x )
iterate
end
LEN = c2d( TOP ) - 128 /* check lead byte '82'x - '86'x */
if LEN < 2 | 6 < LEN then do
DST = DST || 'FFFD'x ; iterate
end
TOP = left( SRC, LEN ) ; SRC = substr( SRC, LEN + 1 )
POS = verify( TOP, xrange( '90'x, '9F'x ))
select
when POS > 0 | abbrev( TOP, '90'x ) then do
if POS > 0 then SRC = substr( TOP, POS ) || SRC
DST = DST || 'FFFD'x ; iterate
end /* found invalid UTF-4 tail byte */
when LEN = 3 | LEN = 5 then TOP = '90'x || TOP
when LEN = 4 & TOP << '9D989090'x then nop
when LEN = 4 & TOP >> '9D9F9F9F'x then nop
when LEN = 2 & abbrev( TOP, '98'x ) then nop
when LEN = 2 & abbrev( TOP, '99'x ) then nop
when LEN = 6 & abbrev( TOP, '9190'x ) then nop
when LEN = 2 | LEN = 4 | LEN = 6 then do
DST = DST || 'FFFD'x ; iterate
end /* reject invalid UTF-4 encoding */
end /* no OTHERWISE, match all cases */
LEN = LEN + LEN // 2 ; HEX = ''
do L = 1 to LEN /* UTF-4 tail byte '9?'x to hex. */
HEX = HEX || right( c2x( substr( TOP, L, 1 )), 1 )
end
select
when LEN = 2 then DST = DST || '00'x || x2c( HEX )
when LEN = 4 then do
DST = DST || x2c( left( HEX, 2 ))
DST = DST || x2c( right( HEX, 2 ))
end
when LEN = 6 then do /* encode UTF-16 surrogate pair: */
DEC = x2d( substr( HEX, 1, 2 )) - 1
DEC = x2d( substr( HEX, 3, 2 )) + 256 * DEC
DEC = x2d( substr( HEX, 5, 2 )) + 256 * DEC
DST = DST || d2c( 55296 + DEC % 1024 )
DST = DST || d2c( 56320 + DEC // 1024 )
end
end
end
return DST
UTF.4O: procedure /* UTF-16BE to UTF-4 encoder */
parse arg SRC ; DST = ''
LOS = length( SRC ) ; if LOS // 2 then signal UTF.ERR
do while LOS > 0
parse var SRC CHR 3 SRC ; LOS = LOS - 2
select
when CHR << '0080'x then DST = DST || right( CHR, 1 )
when CHR << '00A0'x then do
CHR = c2x( right( CHR, 1 ))
DST = DST || x2c( 82 )
DST = DST || x2c( 9 || left( CHR, 1 ))
DST = DST || x2c( 9 || right( CHR, 1 ))
end
when CHR << '0100'x then DST = DST || right( CHR, 1 )
when CHR << '1000'x then do
TOP = c2x( left( CHR, 1 ))
CHR = c2x( right( CHR, 1 ))
DST = DST || x2c( 83 )
DST = DST || x2c( 9 || right( TOP, 1 ))
DST = DST || x2c( 9 || left( CHR, 1 ))
DST = DST || x2c( 9 || right( CHR, 1 ))
end
when CHR << 'D800'x | 'DFFF'x << CHR then do
TOP = c2x( left( CHR, 1 ))
CHR = c2x( right( CHR, 1 ))
DST = DST || x2c( 84 )
DST = DST || x2c( 9 || left( TOP, 1 ))
DST = DST || x2c( 9 || right( TOP, 1 ))
DST = DST || x2c( 9 || left( CHR, 1 ))
DST = DST || x2c( 9 || right( CHR, 1 ))
end
when CHR >> 'DBFF'x then DST = DST || '849F9F9F9D'x
when LOS = 0 then DST = DST || '849F9F9F9D'x
otherwise
parse var SRC TMP 3 SRC ; LOS = LOS - 2
if TMP << 'DC00'x | 'DFFF'x << TMP then do
SRC = TMP || SRC ; DST = DST || '849F9F9F9D'x
LOS = LOS + 2 /* undo wrong trailing surrogate */
end
else do
CHR = 1024 * c2d( CHR ) + c2d( TMP ) - 56613888
TOP = ''
do until CHR = 0
TMP = CHR // 16 ; CHR = CHR % 16
TOP = x2c( 9 || d2x( TMP )) || TOP
end
DST = DST || x2c( 8 || length( TOP )) || TOP
end
end
end
return DST
UTF.1I: procedure /* UTF-1 to UTF-16BE decoder */
parse arg SRC ; DST = ''
do while sign( length( SRC ))
parse var SRC TOP 2 SRC
select
when TOP > 'FB'x then TOP = 4 233006 c2d( TOP ) - 252
when TOP > 'F5'x then TOP = 2 16406 c2d( TOP ) - 246
when TOP > 'A0'x then TOP = 1 256 c2d( TOP ) - 161
when TOP = 'A0'x then TOP = 1 66 0
otherwise
DST = DST || right( TOP, 4, '00'x )
iterate
end
parse var TOP T CHR TOP ; CHR = CHR + TOP * ( 190 ** T )
do N = T - 1 to 0 by -1
parse var SRC TOP 2 SRC ; L = c2d( TOP )
select /* accept trailing 21..7E/A0..FF */
when 160 <= L then L = L - 66
when 33 <= L & L < 127 then L = L - 33
otherwise /* reject trailing 00..20/7F..9F */
CHR = x2d( 'FFFD' ) ; SRC = TOP || SRC
leave N
end
CHR = CHR + L * ( 190 ** N )
end N
if 160 <= CHR & CHR <= 1114111
then DST = DST || x2c( d2x( CHR, 8 ))
else DST = DST || '0000FFFD'x
end /* bad UTF-1 A021..A07E replaced */
return UTF.32( DST ) /* UTF.32 handles any surrogates */
UTF.1O: procedure /* UTF-16BE to UTF-1 encoder */
parse arg SRC ; DST = ''
LOS = length( SRC ) ; if LOS // 2 then signal UTF.ERR
do while LOS > 0
parse var SRC CHR 3 SRC ; LOS = LOS - 2
select
when CHR << '00A0'x then do
DST = DST || right( CHR, 1 )
iterate
end
when CHR << 'D800'x then CHR = c2d( CHR )
when CHR >> 'DFFF'x then CHR = c2d( CHR )
when CHR >> 'DBFF'x then CHR = 65533
when SRC == '' then CHR = 65533
otherwise
parse var SRC TMP 3 SRC ; LOS = LOS - 2
if TMP << 'DC00'x | 'DFFF'x << TMP then do
SRC = TMP || SRC ; CHR = 65533
LOS = LOS + 2 /* undo wrong trailing surrogate */
end
else CHR = 1024 * c2d( CHR ) + c2d( TMP ) - 56613888
end
select
when CHR < 256 then TMP = ( CHR - 66 ) 160 1
when CHR < 16406 then TMP = ( CHR - 256 ) 161 1
when CHR < 233006 then TMP = ( CHR - 16406 ) 246 2
otherwise TMP = ( CHR - 233006 ) 252 4
end
parse var TMP CHR L T
DST = DST || d2c( L + CHR % ( 190 ** T ))
do N = T - 1 to 0 by -1 /* trailing bytes 21..7E, A0..FF */
L = ( CHR % ( 190 ** N )) // 190
if L < 94 then DST = DST || d2c( L + 33 )
else DST = DST || d2c( L + 66 )
end N
end
return DST
BOCU.I: procedure /* BOCU-1 to UTF-16BE decoder */
parse arg SRC ; DST = ''
; PREV = 64 /* (RD1) */
do while sign( length( SRC ))
parse var SRC TOP 2 SRC
select /* single RD4 or multi-byte RD5: */
when TOP = 'FF'x | TOP << '21'x then do /* (RD2) */
if TOP < 'FF'x then DST = DST || right( TOP, 4, '00'x )
if TOP \== ' ' then PREV = 64 /* (RD3) */
iterate /* if 255 only reset state (RD6) */
end
when TOP = 'FE'x then N = +187660 3 254
when TOP > 'FA'x then N = +10513 2 251
when TOP > 'CF'x then N = +64 1 208
when TOP > '4F'x then N = 0 0 144
when TOP > '24'x then N = -64 1 80
when TOP > '21'x then N = -10513 2 37
when TOP = '21'x then N = -187660 3 34
end
parse var N DIFF T N ; TOP = c2d( TOP ) - N
CHR = DIFF + PREV + TOP * ( 243 ** T )
do N = 1 to T
parse var SRC TOP 2 SRC ; TOP = c2d( TOP )
select
when 33 <= TOP then TOP = TOP - 13
when 28 <= TOP & TOP <= 31 then TOP = TOP - 12
when 16 <= TOP & TOP <= 25 then TOP = TOP - 10
when 1 <= TOP & TOP <= 6 then TOP = TOP - 1
otherwise
CHR = c2d( 'FFFD'x ) ; SRC = d2c( TOP ) || SRC
leave N /* restore any invalid tail byte */
end
CHR = CHR + TOP * ( 243 ** ( T - N ))
end N
PREV = BOCU.5( CHR ) ; DST = DST || x2c( d2x( CHR, 8 ))
end
return UTF.32( DST )
BOCU.O: procedure /* UTF-16BE to BOCU-1 encoder */
parse arg SRC ; DST = ''
LOS = length( SRC ) ; if LOS // 2 then signal UTF.ERR
PREV = 64 /* R1 */
do while LOS > 0
parse var SRC CHR 3 SRC ; LOS = LOS - 2
select
when CHR << 'D800'x then CHR = c2d( CHR )
when CHR >> 'DFFF'x then CHR = c2d( CHR )
when CHR >> 'DBFF'x then CHR = 65533
when LOS = 0 then CHR = 65533
otherwise
parse var SRC TMP 3 SRC ; LOS = LOS - 2
if TMP << 'DC00'x | 'DFFF'x << TMP then do
SRC = TMP || SRC ; CHR = 65533
LOS = LOS + 2 /* undo wrong trailing surrogate */
end
else CHR = 1024 * c2d( CHR ) + c2d( TMP ) - 56613888
end
if CHR <= 32 then do /* C0 control or space, R2 or R3 */
if CHR < 32 then PREV = 64 /* R3 */
DST = DST || d2c( CHR ) ; iterate /* R2 */
end
DIFF = CHR - PREV ; PREV = BOCU.5( CHR ) /* R4,R5 */
select /* R4.2 base LEAD bytes and R4.3 */
when 187660 <= DIFF then N = 3 254 ( DIFF - 187660 )
when 10513 <= DIFF then N = 2 251 ( DIFF - 10513 )
when 64 <= DIFF then N = 1 208 ( DIFF - 64 )
when -64 <= DIFF then N = 0 144 ( DIFF - 0 )
when -10513 <= DIFF then N = 1 80 ( DIFF + 64 )
when -187660 <= DIFF then N = 2 37 ( DIFF + 10513 )
otherwise N = 3 34 ( DIFF + 187660 )
end /* UTF-16 abs( DIFF ) <= 1114111 */
parse var N T LEAD DIFF ; TAIL = ''
do N = 1 to T /* determine trail bytes (R4.4): */
M = DIFF // 243 ; DIFF = DIFF % 243
if M < 0 then do /* non-negative modulo (R4.4a) */
M = M + 243 ; DIFF = DIFF - 1
end
select /* avoid 00, 07..0F, 1A..1B, 20: */
when M <= 5 then TAIL = d2c( M + 1 ) || TAIL
when M <= 15 then TAIL = d2c( M + 10 ) || TAIL
when M <= 19 then TAIL = d2c( M + 12 ) || TAIL
otherwise TAIL = d2c( M + 13 ) || TAIL
end
end N
DST = DST || d2c( LEAD + DIFF ) || TAIL /* R4.5 and R4.6 */
end
return DST
BOCU.5: procedure /* middle of page or CJK range */
arg DEC
select
when DEC < 12352 then return 64 + DEC - DEC // 128
when DEC < 12448 then return 12400 /* Hiragana (R5a) */
when DEC < 19968 then return 64 + DEC - DEC // 128
when DEC < 40870 then return 30481 /* Unihan (R5b) */
when DEC < 44032 then return 64 + DEC - DEC // 128
when DEC < 55204 then return 49617 /* Hangul (R5c) */
otherwise return 64 + DEC - DEC // 128
end
UTF.32: procedure /* internal UTF-32BE to UTF-16BE */
parse arg SRC ; DST = ''
do while sign( length( SRC )) /* internal use for good lengths */
parse var SRC TOP 3 LOW 5 SRC
select
when TOP == '0000'x & LOW << 'D800'x
then DST = DST || LOW
when TOP == '0000'x & LOW >> 'DFFF'x
then DST = DST || LOW
when TOP == '0000'x | TOP >> '0010'x
then DST = DST || 'FFFD'x
otherwise /* encode UTF-16 surrogate pair: */
DEC = c2d( right( TOP, 1 )) - 1
DEC = c2d( left( LOW, 1 )) + 256 * DEC
DEC = c2d( right( LOW, 1 )) + 256 * DEC
DST = DST || d2c( 55296 + DEC % 1024 )
DST = DST || d2c( 56320 + DEC // 1024 )
end
end
return DST
UTF.ERR: /* odd number of octets is fatal */
UTF.ERR = 'odd number of octets in UTF-16 string'
parse version . !V! .
if !V! < 6
then exit 22 + lineout( 'stderr:', UTF.ERR 'line' sigl )
else raise syntax 22 description ( UTF.ERR )