From cd7507dbe5f5a21e1cec6da96a4e2a38fc1139cc Mon Sep 17 00:00:00 2001 From: Marshall Lochbaum Date: Tue, 15 Jun 2021 22:31:33 -0400 Subject: =?UTF-8?q?Full=20tokenization=20for=20=F0=9D=95=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/c.bqn | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'src/c.bqn') diff --git a/src/c.bqn b/src/c.bqn index 5060beaa..deaa68b6 100644 --- a/src/c.bqn +++ b/src/c.bqn @@ -20,7 +20,7 @@ charSet‿cgl←(∾ ≍○< ≠¨)⟨ bF‿b1‿b2‿bS‿bG‿bB‿bL‿bO‿bX‿bN‿bD‿bA‿bW‿bP←≍¨˜⟜(0»+`)cgl M←1⊸⊑(0⊸≤∧>)-⟜⊑ # ∊ for an init,length pair 𝕩 as above sep←⊑bS -bI←bX+≍⟜-5 +bI←bX+≍⟜-5⋄bR←8+⊑bX Pl←∾⟜("s"/˜1<≠) # Pluralize _tmpl←{∾𝕗{𝕎𝕩}¨<𝕩} # Template # Convert characters to numbers, mostly the same as tokens @@ -56,11 +56,11 @@ Tokenize←{System‿vars←𝕨 ie←/f⋄is←ie≠⊸↑/1»f⋄Fs←{is/˜↩𝕨⋄𝕨/𝕩} # Token start and end is-↩is(-×⊏⟜c)ie # Comment → ending newline only t←CharCode ie⊏𝕩 - nd←(t=⊑bN)>«t M bD # Namespace dot - w←»⊸«t M bD⋄rr←t=bR # Namespace dot; 𝕣 + w←»⊸us - wk←na⌊∘÷˜(⊑bA)-˜w/t # Kind of word from first char + wk←(¬w/rr)×na⌊∘÷˜(⊑bA)-˜w/t # Kind of word from first char t-↩na×l∧t≥na+⊑bA # Case-insensitive {⟨𝕩/is,"System dot with no name"⟩!0}⍟(∨´)sy>«l w≠↩»⊸∨sy # Start system word at dot @@ -68,12 +68,15 @@ Tokenize←{System‿vars←𝕨 i←l>n←l∧(+`w)⊏0∾¬wi # Identifier/Number masks num←is ReadNums○(((0∾us)<∨⟜«0∾n)/0⊸∾) t×l # Numbers ir←(us/˜«⊸us)×+`w>n # Identifier groups - {⟨is⊏˜𝕩/𝕨,"Numbers can't start with underscores"⟩!0}⍟(∨´⊢)⟜(ws<(⊑bA)>⊏⟜t)/(¯1»⌈`)⊸us)×+`w>n # Identifier groups and first character + w↩if∨n∧w⋄ws←1=0⊸<⊸/wt/˜↩¬w/rr # Don't produce an identifier for 𝕣 + {⟨𝕩/is,"𝕣 can't be used with other word characters"⟩!0}⍟(∨´)(i>us)∧(rr⊸≠∨if⊸<)ig⊏0∾fr + {⟨is⊏˜𝕩/𝕨,"Numbers can't start with underscores"⟩!0}⍟(∨´⊢)⟜(ws<(⊑bA)>⊏⟜t)/rrrr)∾(ci∾/si)⊏+`»f # Indices in t k←id∾num‿chr‿str⋄k(⊢>¯1»⌈`)⊸/¨˜↩j←⊐¨k # IDs j into uniques k k↩System⌾(1⊸⊑)k # System value lookup wf←¬l∨t M bW⋄ie/˜↩wf∨>⟜«l # Index management for... @@ -82,7 +85,8 @@ Tokenize←{System‿vars←𝕨 p←≠`1¨sb←¯1↓1↓/1(∾≠∾˜)t=sep # Separator group boundaries (excludes leading and trailing) sk←sb/˜p>∨⟜«(p+(sb-p)⊏t)∊3‿5+⊑bB # Keep the first of each group that's not just inside a bracket t{ie/˜↩𝕨⋄𝕨Fs𝕩}˜↩1¨⌾(sk⊸⊏)t≠sep # Remove the rest - r←ir⌾((t M vd≍+´2↑kk)⊸/)(vd⌊t)⊏charRole∾0 # Role + im←(t=bR)∨t M vd≍+´2↑kk # Identifier (or 𝕣) mask + r←ir⌾(im⊸/)(vd⌊t)⊏charRole∾0 # Role t+↩(⊑bX)((⊢M≍⟜5)×5+3⊸+⊸≤)t # Case-insensitive special names t-↩vi(<+10×=)t # Shift . to bX and variables back one ⟨t,r,k,is,ie⟩ @@ -94,7 +98,7 @@ ReadNums←{ _err_←{(0!˜/⟜𝔾≍○<𝔽)⍟(∨´)} EChars←⟨"Letter"⊸Pl," """,⊏⟜charSet,""" not allowed in numbers"⟩_tmpl e‿d‿n‿p‿i←=⟜𝕩¨((⊑bA)+-´"ea")∾+⟜↕´bN # Masks for e.¯π∞ - EChars∘(/⟜𝕩)_err_𝕨 ¬e∨𝕩<⊑bA + EChars∘(/⟜𝕩)_err_𝕨 (𝕩=bR)∨¬e∨𝕩<⊑bA s←d∨c←e∨z←0=𝕩⋄m←¬n∨c "Negative sign in the middle of a number"_err_𝕨 n>»c "Portion of a number is empty"_err_𝕨 (1«s)∧n∨s -- cgit v1.2.3