From d5c1dd450ebd29d2041d26cdd8836ecf8fb7ff22 Mon Sep 17 00:00:00 2001 From: Marshall Lochbaum Date: Wed, 25 Jan 2023 09:02:13 -0500 Subject: Bootstrapping compilers with simplified syntax --- src/bootstrap/boot1.bqn | 285 +++++++++++++++++++++++++++++++++++++++++++++++ src/bootstrap/boot2.bqn | 139 +++++++++++++++++++++++ src/bootstrap/verify.bqn | 14 +++ 3 files changed, 438 insertions(+) create mode 100644 src/bootstrap/boot1.bqn create mode 100644 src/bootstrap/boot2.bqn create mode 100644 src/bootstrap/verify.bqn diff --git a/src/bootstrap/boot1.bqn b/src/bootstrap/boot1.bqn new file mode 100644 index 00000000..1a833abd --- /dev/null +++ b/src/bootstrap/boot1.bqn @@ -0,0 +1,285 @@ +# Comiler simplified once +# Full compiler minus error checking and index tracking +# Syntax simplified to avoid _ · ‿ ' 𝕏𝕎𝔽𝔾𝕗 and double quote in comment +# Already didn't have . [] :;? ⇐ 𝕣𝕤𝕘 +# Blocks are functions; no empty list literals; no useless parentheses +na←¯1⊑≢alph←("aA"+⌜↕26)∾˘"àÀ"+⌜(↕23)∾24+↕7 +lf←@+⟨10,13⟩ +⟨charSet,cgl⟩←(∾ ⋈ ≠¨)⟨ + "+-×÷⋆√⌊⌈|¬∧∨<>≠=≤≥≡≢⊣⊢⥊∾≍⋈↑↓↕«»⌽⍉/⍋⍒⊏⊑⊐⊒∊⍷⊔!" # Function + mod1←"˙˜˘¨⌜⁼´˝`" # 1-modifier + "∘○⊸⟜⌾⊘◶⎉⚇⍟⎊" # 2-modifier + "⋄,"∾lf # Separator + ":;?" # Header punctuation + "⇐←↩" # Gets + "(){}⟨⟩[]" # Bracket + "‿" # Ligature + "·" # nOthing + "𝕊𝕏𝕎𝔽𝔾𝕤𝕩𝕨𝕣𝕗𝕘" # Input + ".¯π∞" # Numeric + (⊑"0")+↕10 # Digit + "_"∾˜⥊alph # Alphabetic + "• "∾@+9 # Whitespace + ⟨cc,qs,qd,nc⟩←"#'""@" # Preprocessed characters +⟩ +⟨bF,b1,b2,bS,bH,bG,bB,bL,bO,bX,bN,bD,bA,bW,bP⟩←⋈¨˜⟜(0»+`)cgl +M←1⊸⊑(0⊸≤∧>)-⟜⊑ # ∊ for an init,length pair 𝕩 as above +sep←⊑bS +pred←2+⊑bH +bI←bX+⋈⟜-5⋄bR←8+⊑bX +# Convert characters to numbers, mostly the same as tokens +cgf←(cgr←⍋charSet)⊏charSet +CharCode←cgr⊏˜1-˜1⌈cgf⍋⊢ +swapundo←CharCode∊⟜mod1⊸/"˜⁼" + +vd←1+vi←⊑bN # Start of identifier numbering (plus dot) +charRole←4∾˜∾⥊¨˜⟜(≠↑cgl˙)⟨1,2,3,¯1,¯1,¯3,⟨¯1,0⟩,¯2,0,¬/⟨5,6⟩⟩ # For first vd chars +T←⌈`× ⋄ IT←↕∘≠⊸T ⋄ I1T←(1+↕∘≠)⊸T +PN←1(∾/∾˜)(∨/⊣) # Partitioned-none: partitions where 𝕨<𝕩 is never 1 + +# Source to ⟨tokens, roles, number of identifiers, literals⟩ +# Identifiers then literal tokens are numbered starting at vi +Tokenize←{⟨System,vars⟩←𝕨 + # Resolve comments and strings + c←𝕩=cc⋄s←/⟨0,0⟩⊸«⊸∧sm←𝕩=qs⋄d←/dm←𝕩=qd + g←⍋q←∾⟨ s⋄¯1↓d⋄/c⟩ ⋄q↩g⊏q # Open indices + e← g⊏∾⟨2+s⋄ 1↓d⋄-⟜»∘⊏⟜(0∾+`c)⊸//(𝕩∊lf)∾1⟩ # Matching close indices + Se←≠(>/⊢)∾⟜≠{(⊏˜𝕨)𝕊⍟(≠○(¯1⊸⊑))𝕩∾𝕩⊏𝕨}⟨0⟩˙ # Find reachable openings + St←(≠𝕩)↑(/⁼(Se q⍋e)⊸⊏) # All indices → reached mask + a←St q⋄b←St e⋄f←1≠`ab←a∨b # Open/close masks; filter + + # Extract character and string literals + u←f∧𝕩=nc⋄ci←/u∨»a∧sm + chr←(⊏⟜𝕩-(nc-@)×⊏⟜u)ci # Characters (indices ci) + f>↩qe←dm∧«a∧↩dm # Quote Escape + str←𝕩⊔˜1-˜(si←a>»qe)(⊣+`⊸×○(∾⟜1)<)≠`dm∧ab # Strings (indices /si) + + # Extract words: identifiers and numbers + t←CharCode f/𝕩 + nd←(t=⊑bN)>«t M bD⋄rr←t=bR # Namespace dot; 𝕣 + w←»⊸n←l∧(+`w)⊏0∾¬wi # Identifier/Number masks + num←ReadNums○(((0∾us)<∨⟜«0∾n)/0⊸∾) t×l # Numbers + ir←(us/˜«⊸us)×+`w>n # Identifier groups and first character + fr←(1=wi/wt)rr)∾(ci∾/si)⊏+`»f # Indices in t + k←id∾⟨num,chr,str⟩⋄k(⊢>¯1»⌈`)⊸/¨˜↩j←⊐¨k # IDs j into uniques k + k↩System⌾(1⊸⊑)k # System value lookup + wf←¬l∨t M bW # Index management for... + t↩(w∨wf)/(vars≠⊸↓∾j++`vd»kk←≠¨k)⌾(ki⊸⊏)t # Add IDs; remove words/whitespace + t-↩t(M×-⟜⊑)bS # Separators are equivalent + p←≠`1¨sb←¯1↓1↓/1(∾≠∾˜)t=sep # Separator group boundaries (excludes leading and trailing) + eb←⟨3,5,7⟩+⊑bB # End brackets that allow separators + sk←sb/˜p>∨⟜«(M⟜bH∨eb∊˜p⊸+)(sb-p)⊏t # Keep the first of each group that's not just inside a bracket + t/˜↩1¨⌾(sk⊸⊏)t≠sep # Remove the rest + im←(t=bR)∨t M vd⋈+´2↑kk # Identifier (or 𝕣) mask + r←ir⌾(im⊸/)(vd⌊t)⊏charRole∾0 # Role + t+↩(⊑bX)((⊢M⋈⟜5)×5+3⊸+⊸≤)t # Case-insensitive special names + t-↩vi(<+10×=)t # Shift . to bX and variables back one + ⟨t,r,k⟩ +} + +# 𝕩 is a list of tokens that contains the numeric literals, each +# preceded by 0. Return the numbers. +ReadNums←{ + ⟨e,d,n,p,i⟩←=⟜𝕩¨((⊑bA)+-´"ea")∾+⟜↕´bN # Masks for e.¯π∞ + c←e∨z←0=𝕩⋄m←¬n∨c + f←(17≥¬(⊢-T)+`)⊸∧g←(«≤(d<𝕩≠⊑bD)>○I1T¬)⊸∧m # No leading 0s; max 17 digits + l←(¯1∾⟨π,1⟩∾↕10)⊏˜(¬d)/f×𝕩-1+⊑bN # Digit lookup, with ∞ as 1 to avoid ∞×0 + v←(>⟜«0≤l)/0(0⊸≤××⟜10⊸+)`l # Numeric values—mantissas and exponents + v×↩⟨1,¯1⟩⊏˜(r←>⟜»m)/»n # Negate if ¯ + vm←c/z # Mask of mantissas in l + dp←vm/f(--»⊸-(<×⊢)⊏⟜(I1T«d)⊸-)○(/>⟜«)g # Decimal position + t←10⋆|ee←dp-˜vm/«v׬vm # Power of 10 + t÷˜⌾((0>ee)⊸/)t×⌾((0«dc<0≤r+p # Role ¯4 for exports: ⊑bG is ⇐ + sr←»⌾(((⍋⊏⟜dl)⊸⊏g)⊸⊏)sl←«⊸∨r=¯2⋄ns←¬sl∨sr # Strand right and left; not stranded + cp←𝕩=1+⊑bB # Closed paren + nr←(IT¬cp)⊏(𝕩=2+⊑bI)+2×𝕩=⊑bO # Nothingness role: 1 for 𝕨, 2 for · + nx←0 + g⊏˜↩⍋g⊏sdl←sl∨dl # Avoid reordering strands and dots in rev + rp←≠⊸»⌾(g⊸⊏)↕≠r # Position of previous, for roles + # Permutation to reverse each expression: *more* complicated than it looks + rev←⍋+`¯1↓(¯1∾g)(⊣⍋⊸⊏⊏˜⟜⍋¬⊏˜)⍋+`⊸+1∾g⊏sdl∨r=¯1 + gf←⍋fd←+`br←rev⊏p×𝕩M⟨2+⊑bB,2⟩ # Order by brace depth fd to de-nest blocks + rev⊏˜↩gf⋄fd⊏˜↩gf⋄br⊏˜↩gf + 𝕩⊏˜↩rev⋄dc⊏˜↩rev + + # Compute parsing ordering gr≡g⊏rev + BE←=∨+⟜2⊸= # Bracket equals: match ⟨[ or ⟩] given ⟨ or ⟩ only + g↩⍋+`p↩br-˜rev⊏p⋄bp←0(<⋈○(/⟜g)>)g⊏p # Order by non-brace bracket depth + g⊏˜↩⍋g⊏«⊸∨dc⋄gr←g⊏rev # Now by dots + sll←1+2÷˜0(<-○/>)gr⊏sr-sl⋄l←/g⊏𝕩BE˜5+⊑bB # Strand length; list starts + b←br>0⋄c←/br<0⋄bp∾¨↩⟨/b,c⟩ # Block Begin (mask) and Close (index), in matching order + g⊏˜↩gs←⍋gr⊏sl⋄gr↩g⊏rev⋄gi←⍋g # Send strand prefixes *‿ to the end + + # Headers + hh←𝕩=⊑bH⋄cs←𝕩=1+⊑bH # Case header : and separator ; + fi←+`cb←b∨cs⋄H←cb¬∘PN⊢ # Body index fi; which bodies Have a property + cq←(H𝕩=pred)∨ch←H hh # ch: body has : header ; cq: or ? predicate + cf←1∾¬co←cb/cs⋄cm←0∾∨⟜«co # cf: body is first; cm: body is one of multiple + cc←(⍋⍋«co)⊏c∾/cs # Case close + hi←/hf←hh⊏˜⟜IT⌾((⌽g)⊸⊏)cb∨hh # Header component indices + un←0=us←swapundo(≠∘⊣-⊐)hi⊏𝕩 + ut←un/»us⋄hi/˜↩0=us # Undo type: 0 normal, 1 ⁼, 2 ˜⁼ + hr←(⊏⟜ns×⊏⟜r)rev⊏˜hi # Header component roles + hl←2=hn←(1⊸»+«)hc←¯1=hr # hl: is label, hc: is : + ho←(»∨(«(hr=3)∧⊢))hl<2≤hr # Header operands + hm←¬ho∨ha←ho<(0=hr)∧1=hn # Mask for main name; header arguments + hk←3|1-˜(+`bI∾nv)⍋hi⊏𝕩׬rev⊏sr # Kind: 0 special, 1 name, 2 compound + hma←hm>hla←hl∧(0=hr)∧1≠hk⋄hr+↩hla⋄hl>↩hla # Lone non-name subject is 𝕩 with 𝕊 omitted + hv←(hla+ha×1+«hc)+(ho×4+«3=hr)+hma×3×1-˜2⌊hr # Special name for position + hk×↩¬hc∨hl∧0=hr # Treat subject labels like special names + hm>↩hc⋄hr/˜↩hm⋄hx←(1»hc)/ha # Header-derived role hr and immediacy ¬hx + ut-↩-⟜»ut×ho # Shift ⁼ from right operand to main name + ut/˜↩hm⋄hx∨↩1=hr + cwh←hc/»hl⌈ha×1+he←0≠hk # Body 𝕨 for just headers + ut2←2=ut + cw←(cwh⌈2×ut2)⌾(ch⊸/)1+-⟜«(»cq)<1(⊢<«)cf # Body 𝕨: 0 no, 1 allowed, 2 required + hl/˜↩hm⋄hu←(¬he)⌾(hi⊸⊏)hf # hu: mask of header special names + hj←gi⊏˜he/hi⋄hd←2=he/hk # hj: header assignments; hd: which ones destructure + + # Block properties + ss←⟨0,3,5,6⟩⍋(⊢+(0sl # Indices of list literals + lm←(0¨sll)∾˜(5+⊑bB)-˜l0⊏𝕩 # List merge, adding 2 for [] + + # Parsing part 1 + a←(¯5⊸<∧≤⟜¯3)r⋄ps←a↩alm←ai⊏aa⋄al←alm/ai # aliases al + ai/˜↩af⋄at/˜↩af∾1¨hj + + # Lexical resolution (independent of parsing part 2 below) + di←/dm←»dc # Dots aren't scoped + id←/(hu∨dm∨gi⊏«aa∧a)<(0⊸≤∧<⟜nv)xv # Identifier indices in xv + sa←0)ia∾sa # Opcode + idoc←⟨32¨hj,0¨hj,he/hv + 64¨di,di⊏xv, ido,idd∾0¨sp,idi∾spi⟩ # Identifier bytecode: instruction, depth, slot + + # Parsing part 2 + ta←tr∧2(>∨|)ps(⊢-T)+`¬ro # Train argument (first-level) + fa←/(hg∨ta∨ro∨«⊸∨ps(2=ne)∨ls∨»r=¯5⋄rt←/fo # Drop (block separator) and return + qp←/𝕩=pred # Predicate + fl←(dy×⊏⟜os)⊸+fa+dy # Function application site + dr∾↩((1+dy)×fn←2=fm←fa⊏ne)/fl # Turn function applications on · to drops + fn↩¬fn⋄fa/˜↩fn⋄fl/˜↩fn # And remove them + + # Object code generation: numbers oc ordered by source location (after rev) oi + ao←48+(0⌈(1+⊑bG)-˜ai⊏𝕩+ma+mm)∾-hd # Assignment opcode + or←⍋idor∾g⊏˜∾⟨cn,cn,bk,bk,hq,api,2/l,at,dr,qp,al+1,al+1,oa+1⌈oa⊏os,fl,rt⟩ + oc←or⊏∾idoc∾⟨0¨cn,ob,1¨bk,1+↕≠bk,43¨hq,44¨api,⥊⍉(11+lm+l⊏aa)≍ll,ao,6¨dr,42¨qp,66¨al,vi-˜(al-1)⊏𝕩 + 24+oa⊏r,16+(fn/dy+2×fm⌈1=ny)+4×0blk, <˘⍉>bdy⟩ +} diff --git a/src/bootstrap/boot2.bqn b/src/bootstrap/boot2.bqn new file mode 100644 index 00000000..7ffa780c --- /dev/null +++ b/src/bootstrap/boot2.bqn @@ -0,0 +1,139 @@ +# Compiler simplified twice +# Suitable for compiling once-simplified compiler +# Single-scope; no modified or list assignment; no 2-modifiers +{ + charSet←∾chars←⟨ + "+-×÷⋆√⌊⌈|¬∧∨<>≠=≤≥≡≢⊣⊢⥊∾≍⋈↑↓↕«»⌽⍉/⍋⍒⊏⊑⊐⊒∊⍷⊔!" # Function + "˙˜˘¨⌜⁼´˝`" # 1-modifier + "∘○⊸⟜⌾⊘◶⎉⚇⍟⎊" # 2-modifier + "⋄,"∾lf←@+10 # Separator + "←↩" # Gets + "(){}⟨⟩" # Bracket + "𝕊𝕩𝕨" # Input + "¯π∞" # Numeric + (⊑"0")+↕10 # Digit + ⥊"aA"+⌜↕na←26 # Alphabetic + " " # Whitespace + sc←"#""@" # Preprocessed characters + ⟩ + cm←((0»+`)⋈¨⊢)cgl←≠¨chars + bS←3⊑cm⋄bG←4⊑cm⋄bB←5⊑cm⋄bI←6⊑cm⋄bN←7⊑cm⋄bA←9⊑cm⋄bW←10⊑cm + sep←⊑bS + vi←⊑bN # Start of identifier numbering + charRole←∾cgl⥊¨⟨1,2,3,¯1,¯3,⟨¯1,0⟩,⟨1,0,0⟩,0,0,26/⟨0,1⟩,4,0⟩ + TT←⌈`× ⋄ IT←(↕≠)TT⊢ ⋄ I1T←(1+(↕≠))TT⊢ + + # Comments and strings + s←≠`dd←𝕩=1⊑sc + f←s<(I1T s<𝕩=⊑sc)≤I1T𝕩=lf # Filter comments + chr←@¨ci←/f∧𝕩=2⊑sc # Characters (indices ci) + f↩f>qe←dd∧«sd←s∧dd # Quote Escape + si←sd>»qe # String indices + str←𝕩⊔˜1-˜(+`si∾1)×(si»l # Word chars l, start w + wi←(⊑bA)≤w/t # Type: 0 number, 1 identifier + r←t⊏charRole # Role + t↩t-na×l∧t≥na+⊑bA # Case-insensitive + i←l>n←l∧(+`w)⊏0∾¬wi # Identifier/Number masks + ide←(1-˜i×+`w>n)⊔t⊏charSet # Identifiers + + # Numbers, at most 2 digits + nt←((⊢∨«)0∾n)/0∾t×l # Number tokens separated by 0 + nn←nt=⊑bN⋄m←¬nn∨0=nt # Mask for ¯; digits + nl←(0∾⟨π,∞⟩∾↕10)⊏˜m×nt-⊑bN # Digit lookup + ns←⟨1,¯1⟩⊏˜(m>»m)/»nn # Negate if ¯ + num←ns×(m>«m)/nl+10×»nl # Numeric values + + # Deduplicate literals and identifiers; other cleanup + # Identifiers then literal tokens are numbered starting at vi + ki←((⍒wi)⊏/w)∾(ci∾/si)⊏+`»f # Indices in t + k←⟨ide,⟨⟩,num,chr,str⟩ # Constants + k↩k/¨˜(⊢>¯1»⌈`)¨j←⊐¨k # IDs j into uniques k + wr←w∨¬l∨t=⊑bW⋄r↩wr/r⋄c←≠t + t↩wr/(c↑⍋(⊢+c×⊒)ki∾↕c)⊏(∾j++`vi»≠¨k)∾t # Add IDs; remove words/whitespace + t↩t-(t<+´bS)×(⊢×0≤⊢)t-⊑bS # Separators are equivalent + pb←≠`1¨sb←¯1↓1↓/1(∾≠∾˜)t=sep # Separator group boundaries (excludes leading and trailing) + eb←⟨3,5⟩+⊑bB # End brackets that allow separators + sk←sb/˜pb>(⊢∨«)eb∊˜pb+(sb-pb)⊏t # Keep the first of each group that's not just inside a bracket + sr←((≠t)↑/⁼sk)∨t≠sep⋄r↩sr/r⋄t↩sr/t # Remove the rest + 𝕩↩t⋄nv←≠⊑k + # End of tokenization! + + # Bracket roles + # Open brackets initially have role ¯1 and closed ones have role 0 + gb←⍋+`p←(¯1-2×r)×(𝕩≥⊑bB)∧𝕩<+´bB # Paren (actually any bracket type) depth and grade + r↩r+𝕩=3+⊑bB # Assume blocks are functions + cp←𝕩=1+⊑bB # Closed paren + rp←(⍋gb)⊏(≠gb)»gb # Position of previous, for roles + ir←((IT cp≤⊢)⊏⊢)(rp⊏0∾˜3=r)∨00⋄bc←/br<0 # Block Begin (mask) and Close (index), in matching order + 𝕩↩gr⊏𝕩⋄r↩gr⊏r⋄ir↩gr⊏ir + + # Parsing part 1 + a←¯3=r⋄ps←a⊢)(⍋+`ig⊏d∾¯1¨dv)⊏ig # Last order by declaration depth + d↩ig⊏d⋄id↩ig⊏id + ia←0<(id∾sp)⊏ac # Which are assignments + idd←(⊢-(IT d)⊏⊢)id⊏fd # Identifier frame depth + idi←((¯1+`d)⊏(⍋⍋d/ig)⊏⊢)(⊒+⊢⊏fsc˙)df # Slot within frame + spi←3+sp⊏xv # Special name index + uu←(1«d)∧d((+`⊣)⊏1(∾/∾˜)(∨/⊣))0)ia,idd∾0¨sp,idi∾spi⟩ # Identifier bytecode: instruction, depth, slot + + # Parsing part 2 + ta←tr∧2(>∨|)ps(⊢-TT)+`¬ro # Train argument (first-level) + fa←/(ta∨ro∨(«∨⊢)psls⋄rt←/fo # Drop (block separator) and return + fl←(⊢+dy×⊢⊏os˙)fa+dy # Function application site + + # Object code generation: numbers oc ordered by source location (after rev) oi + or←⍋idor∾g⊏˜∾⟨cn,cn,bk,bk,2/lb,at,dr,oa+1⌈oa⊏os,fl,rt⟩ + oc←or⊏∾idoc∾⟨0¨cn,ob,1¨bk,1+↕≠bk,⥊⍉(11+lb⊏aa)≍ll,48+ao,6¨dr + 24+oa⊏r,16+dy+4×fa⊏tr,¯1↓rc←7¨fx⟩ + # Output + fz←⟨0¨fx,¬fx,↕≠fx⟩ # Per-function data + cz←⟨/1∾or≥(≠oc)-≠rt,fsc+≠¨dn,dn,0¨¨dn⟩ # Per-body data + ⟨oc∾¯1⊑rc,∾⟨u⊏𝕨⟩∾1↓k,<˘⍉>fz,<˘⍉>cz⟩ # Overall output +} diff --git a/src/bootstrap/verify.bqn b/src/bootstrap/verify.bqn new file mode 100644 index 00000000..66939576 --- /dev/null +++ b/src/bootstrap/verify.bqn @@ -0,0 +1,14 @@ +glyphs ← •Import "../glyphs.bqn" +gl ← ("⟨"∾"⟩"«∾","⊸∾¨'"'(⊣∾∾˜)¨glyphs) # Has to replace •args in c.bqn + +f ← "../c.bqn"‿"boot1.bqn"‿"boot2.bqn" # Files to test +c ← (1‿2/⟨glyphs⊸•Import,•Import⟩) {𝕎𝕩}¨ f # Resulting compilers +c ↩ (∾glyphs){𝕗⊸𝕏}¨ c +t ← (∾∾⟜(@+10)¨)¨ (¯5⊸↓∾gl˙)⌾⊑⌾⊑ •FLines¨ f # Compiler source + +# 4⊸↑ to strip source info +# ⋈⁼∘∾⍟=¨⌾(2⊑¨2⊸⊑) to turn only-dyadic functions to ambivalent +! ≡○(4⊸↑)´ (2↑c) {𝕎𝕩}¨ ⊏t +•Out "Boot -1 verified!" +! ≡○(⋈⁼∘∾⍟=¨⌾(2⊑¨2⊸⊑) 4⊸↑)´ (0‿2⊏c) {𝕎𝕩}¨ 1⊏t +•Out "Boot -2 verified!" -- cgit v1.2.3