diff options
| author | Drahflow <drahflow@gmx.de> | 2014-03-28 02:07:00 +0100 |
|---|---|---|
| committer | Drahflow <drahflow@gmx.de> | 2014-03-28 02:07:00 +0100 |
| commit | 0553c5f0ac14184df6af0579b69df3079db75830 (patch) | |
| tree | 000b1857fdca441ef33b95775dccb147a695858d | |
| parent | b5362e4f9060cd2f69040b25aefe248e02182fe8 (diff) | |
UTF-8 decoder
| -rw-r--r-- | elymas/lib/utf8.ey | 40 |
1 file changed, 40 insertions, 0 deletions
diff --git a/elymas/lib/utf8.ey b/elymas/lib/utf8.ey
new file mode 100644
index 0000000..cde6ebc
--- /dev/null
+++ b/elymas/lib/utf8.ey
@@ -0,0 +1,40 @@
+< # outer scope; it is bound to the name utf8 on its closing line
+  < # inner scope holding the decoder helpers and state defined below
+    txt .consume .|hu "%" defq # bind % to hu from txt .consume; used below as the prefix of byte constants such as %7F (presumably a hex parser - confirm)
+    { "2120" "-" | |le "021" "-" | |ge |and } /in defq # in: range test used below as: value lo hi in; combines le and ge with and (stack shuffles not verified here)
+    sys .asm .|rawAddress =*:addr # addr: shorthand for rawAddress from sys .asm; consume uses it to compare one against next
+
+    { [ -01 ] cat }" =*:emit # emit: wrap the top value in a one-element array and cat it onto the array below (assuming -01 swaps the top two entries)
+    { 0 ==accum one =*next }' =*:init # init: reset accum to 0 and make one the handler reached through next
+
+    { { accum emit one =next }' -01 # trail: turn an array of byte tests into chained handlers; the innermost step emits accum and points next back at one
+      { =*p =*n { { # p: test for this byte; n: step to run after the byte was accepted
+        p { # the result of p selects one of the two branches below
+          %3F band accum %40 mul add =accum n # accepted: accum = accum * %40 + (byte band %3F), then run n
+        }' { < { ==c [ %FFFD ] cat init c next }' =*replace > ???enc.utf8.ill-formed }' ? * # rejected: signal enc.utf8.ill-formed (??? presumably raises - confirm); its replace appends %FFFD, re-inits, then feeds byte c to next again
+      }' =next }' } each # every generated step stores its byte handler into next
+    } /trail deffd
+
+    [ # compare Table 3-7 of http://www.unicode.org/versions/Unicode6.3.0/ch03.pdf
+      { _ %00 %7F in }" |emit # bytes 00..7F are passed to emit unchanged
+      { _ %C2 %DF in }" { %1F band =accum }' [ { _ %80 %BF in }" ] trail ; # row layout: lead-byte test, accum seed masked from the lead byte, continuation tests handed to trail
+      { _ %E0 eq }" { %0F band =accum }' [ { _ %80 %BF in }" { _ %A0 %BF in }" ] trail ; # tests appear to be listed last byte first: A0..BF is the second-byte range for E0 in Table 3-7
+      { _ %E1 %EC in }" { %0F band =accum }' [ { _ %80 %BF in }" _ ] trail ; # the bare _ presumably duplicates the preceding 80..BF test - confirm
+      { _ %ED eq }" { %0F band =accum }' [ { _ %80 %BF in }" { _ %80 %9F in }" ] trail ;
+      { _ %EE %EF in }" { %0F band =accum }' [ { _ %80 %BF in }" _ ] trail ;
+      { _ %F0 eq }" { %07 band =accum }' [ { _ %80 %BF in }" _ { _ %90 %BF in }" ] trail ;
+      { _ %F1 %F3 in }" { %07 band =accum }' [ { _ %80 %BF in }" _ _ ] trail ;
+      { _ %F4 eq }" { %07 band =accum }' [ { _ %80 %BF in }" _ { _ %80 %8F in }" ] trail ;
+      { -- 1 }" { < { [ %FFFD ] cat init }' =*replace > ???enc.utf8.ill-formed } # fallback row for every other byte: signal ill-formed; replace appends %FFFD and re-inits
+    ] ==:C { C conds } ==:one # one: dispatch a byte through table C using conds
+
+    # decode a UTF-8 byte string into a sequence of Unicode code points
+    # 0 -> input string
+    # 0 <- the code points represented by that string
+    { init [ ] -01 { next } each # reset state, place an empty result array below the input, feed every element to next
+      one addr |next addr neq { < { [ %FFFD ] cat }' =*replace > ???enc.utf8.ill-formed } rep # next differing from one after the last byte means a sequence was cut short: signal ill-formed, replace appends %FFFD (rep presumably runs the block neq-many times - confirm)
+    }
+  > -- /consume deffd # bind the function above as consume; the -- presumably discards the inner scope object - confirm
+> /utf8 defvd # bind the outer scope as utf8
+
+# vim: syn=elymas |
