aboutsummaryrefslogtreecommitdiff
path: root/elymas/lib/utf8.ey
blob: a0bbdbc85c8c1cb4be9adf55137f3a6c9979a4f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# UTF-8 decoder module. The outer scope opened here is bound to the name
# `utf8` at the bottom; the only member defined in it is `consume`.
<
  # Inner scope holding the decoder's private helpers and state (`accum`,
  # `next`). NOTE(review): the closing `> --` below appears to discard this
  # scope object and keep only the function left beneath it -- confirm.
  <
    # Install `%` as a quote prefix backed by txt.consume.hu so that tokens
    # such as %7F are read as numbers (presumably hexadecimal -- the values
    # used below only make sense as hex byte/code point constants).
    txt .consume .|hu "%" defq
    # `in`: range test, used below as `x lo hi in`.
    # NOTE(review): "2120" and "021" look like operands for the `-` stack
    # shuffle, making this "x <= hi and x >= lo" -- confirm against the
    # Elymas shuffle semantics.
    { "2120" "-" | |le "021" "-" | |ge |and } /in defq
    # `addr`: alias for sys.asm.rawAddress; used by `consume` to check
    # whether `next` is currently the very same function as `one`.
    sys .asm .|rawAddress =*:addr

    # `emit`: wrap the top-of-stack value in a one-element array and
    # concatenate it onto the array beneath it (i.e. append to the output).
    { [ -01 ] cat }" =*:emit
    # `init`: reset decoder state -- zero the accumulator and make `next`
    # (the handler for the next input byte) the lead-byte dispatcher `one`.
    { 0 ==accum one =*next }' =*:init

    # `trail`: takes an array of predicates, one per expected trailing byte,
    # and builds the chain of per-byte handlers for them.
    # Construction runs back to front (the array is reversed): the innermost
    # continuation emits `accum` and switches `next` back to `one`; each
    # predicate `p` is then wrapped around the continuation `n` built so far.
    # Each generated step installs a new `next` which, for the following byte:
    #   * if `p` accepts it: accum = accum * %40 + (byte band %3F), then
    #     runs `n`;
    #   * otherwise: signals enc.utf8.ill-formed, offering a `replace` action
    #     that saves the offending byte, appends %FFFD to the output,
    #     re-initialises the decoder and feeds the saved byte to `next` again
    #     so it is re-examined as a possible lead byte.
    # NOTE(review): the exact semantics of `???` and of how `replace` gets
    # invoked are not visible in this file -- confirm.
    { { accum emit one =next }' -01 reverse
      { =*p =*n { {
        p { %3F band accum %40 mul add =accum n }'
          { < { ==c [ %FFFD ] cat init c next }' =*replace > ???enc.utf8.ill-formed }' ? *
      }' =next }' } each
    } /trail deffd

    # Lead-byte dispatch table `C`, evaluated by `one` via `conds`.
    # Each row is: predicate on the lead byte, then the action for it.
    #   * %00..%7F        : emitted directly as a code point.
    #   * multi-byte leads: the action masks the payload bits of the lead
    #     byte into `accum` (%1F for 2-byte, %0F for 3-byte, %07 for 4-byte
    #     sequences) and is combined (`;`) with the handler chain `trail`
    #     builds from the listed trailing-byte ranges. A bare `_` inside the
    #     predicate array duplicates the preceding predicate.
    #   * last row        : catch-all for every byte not matched above
    #     (%80..%C1 and %F5..%FF); signals enc.utf8.ill-formed with a
    #     `replace` action that appends %FFFD and re-initialises the decoder.
    # The restricted second-byte ranges for %E0, %ED, %F0 and %F4 follow the
    # table referenced below.
    [ # compare Table 3-7 of http://www.unicode.org/versions/Unicode6.3.0/ch03.pdf
      { _ %7F le     }" |emit
      { _ %C2 %DF in }" { %1F band =accum }' [ { _ %80 %BF in }" ] trail ;
      { _ %E0 eq     }" { %0F band =accum }' [ { _ %A0 %BF in }" { _ %80 %BF in }" ] trail ;
      { _ %E1 %EC in }" { %0F band =accum }' [ { _ %80 %BF in }" _                 ] trail ;
      { _ %ED eq     }" { %0F band =accum }' [ { _ %80 %9F in }" { _ %80 %BF in }" ] trail ;
      { _ %EE %EF in }" { %0F band =accum }' [ { _ %80 %BF in }" _                 ] trail ;
      { _ %F0 eq     }" { %07 band =accum }' [ { _ %90 %BF in }" { _ %80 %BF in }" _ ] trail ;
      { _ %F1 %F3 in }" { %07 band =accum }' [ { _ %80 %BF in }" _ _                 ] trail ;
      { _ %F4 eq     }" { %07 band =accum }' [ { _ %80 %8F in }" { _ %80 %BF in }" _ ] trail ;
      { -- 1 }" { < { [ %FFFD ] cat init }' =*replace > ???enc.utf8.ill-formed }
    ] ==:C { C conds } ==:one

    # consume: decode a UTF-8 encoded string into a sequence of code points
    # 0 -> input string
    # 0 <- the code points represented by that string
    #
    # Resets the decoder, starts with an empty output array and feeds every
    # element of the input to the current `next` handler. Afterwards, if
    # `next` is not `one` again (compared by raw address), the input ended in
    # the middle of a multi-byte sequence: enc.utf8.ill-formed is signalled
    # with a `replace` action that appends %FFFD.
    # NOTE(review): `rep` is presumably driven by the 0/1 result of `neq`,
    # so the block runs at most once -- confirm.
    { init [ ] -01 { next } each
      one addr |next addr neq { < { [ %FFFD ] cat }' =*replace > ???enc.utf8.ill-formed } rep
    }
  > -- /consume deffd
> /utf8 defvd

# vim: syn=elymas