aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDrahflow <drahflow@gmx.de>2014-03-28 02:07:00 +0100
committerDrahflow <drahflow@gmx.de>2014-03-28 02:07:00 +0100
commit0553c5f0ac14184df6af0579b69df3079db75830 (patch)
tree000b1857fdca441ef33b95775dccb147a695858d
parentb5362e4f9060cd2f69040b25aefe248e02182fe8 (diff)
UTF-8 decoder
-rw-r--r--elymas/lib/utf8.ey40
1 files changed, 40 insertions, 0 deletions
diff --git a/elymas/lib/utf8.ey b/elymas/lib/utf8.ey
new file mode 100644
index 0000000..cde6ebc
--- /dev/null
+++ b/elymas/lib/utf8.ey
@@ -0,0 +1,40 @@
+<
+ <
+ txt .consume .|hu "%" defq
+ { "2120" "-" | |le "021" "-" | |ge |and } /in defq
+ sys .asm .|rawAddress =*:addr
+
+ { [ -01 ] cat }" =*:emit
+ { 0 ==accum one =*next }' =*:init
+
+ { { accum emit one =next }' -01
+ { =*p =*n { {
+ p {
+ %3F band accum %40 mul add =accum n
+ }' { < { ==c [ %FFFD ] cat init c next }' =*replace > ???enc.utf8.ill-formed }' ? *
+ }' =next }' } each
+ } /trail deffd
+
+ [ # compare Table 3-7 of http://www.unicode.org/versions/Unicode6.3.0/ch03.pdf
+ { _ %00 %7F in }" |emit
+ { _ %C2 %DF in }" { %1F band =accum }' [ { _ %80 %BF in }" ] trail ;
+ { _ %E0 eq }" { %0F band =accum }' [ { _ %80 %BF in }" { _ %A0 %BF in }" ] trail ;
+ { _ %E1 %EC in }" { %0F band =accum }' [ { _ %80 %BF in }" _ ] trail ;
+ { _ %ED eq }" { %0F band =accum }' [ { _ %80 %BF in }" { _ %80 %9F in }" ] trail ;
+ { _ %EE %EF in }" { %0F band =accum }' [ { _ %80 %BF in }" _ ] trail ;
+ { _ %F0 eq }" { %07 band =accum }' [ { _ %80 %BF in }" _ { _ %90 %BF in }" ] trail ;
+ { _ %F1 %F3 in }" { %07 band =accum }' [ { _ %80 %BF in }" _ _ ] trail ;
+ { _ %F4 eq }" { %07 band =accum }' [ { _ %80 %BF in }" _ { _ %80 %8F in }" ] trail ;
+ { -- 1 }" { < { [ %FFFD ] cat init }' =*replace > ???enc.utf8.ill-formed }
+ ] ==:C { C conds } ==:one
+
+ # convert string to sequence of UTF8 codepoints
+ # 0 -> input string
+ # 0 <- the code points represented by that string
+ { init [ ] -01 { next } each
+ one addr |next addr neq { < { [ %FFFD ] cat }' =*replace > ???enc.utf8.ill-formed } rep
+ }
+ > -- /consume deffd
+> /utf8 defvd
+
+# vim: syn=elymas