src/bootstrap/boot2.bqn


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

# Compiler simplified twice
# Suitable for compiling once-simplified compiler
# Single-scope; no modified or list assignment; no 2-modifiers
{
  charSet←∾chars←⟨
    "+-×÷⋆√⌊⌈|¬∧∨<>≠=≤≥≡≢⊣⊢⥊∾≍⋈↑↓↕«»⌽⍉/⍋⍒⊏⊑⊐⊒∊⍷⊔!" # Function
    "˙˜˘¨⌜⁼´˝`"           # 1-modifier
    "∘○⊸⟜⌾⊘◶⎉⚇⍟⎊"         # 2-modifier
    "⋄,"∾lf←@+10          # Separator
    "←↩"                  # Gets
    "(){}⟨⟩"              # Bracket
    "𝕊𝕩𝕨"                 # Input
    "¯π∞"                 # Numeric
    (⊑"0")+↕10            # Digit
    ⥊"aA"+⌜↕na←26         # Alphabetic
    " "                   # Whitespace
    sc←@+⟨35,34,64⟩       # Preprocessed characters: hash, double quote, @
  ⟩
  cm←((0»+`)⋈¨⊢)cgl←≠¨chars
  bS←3⊑cm⋄bG←4⊑cm⋄bB←5⊑cm⋄bN←7⊑cm⋄bA←9⊑cm⋄bW←10⊑cm
  sep←⊑bS
  vi←⊑bN  # Start of identifier numbering
  charRole←∾cgl⥊¨⟨1,2,3,¯1,¯3,⟨¯1,0⟩,⟨1,0,0⟩,0,0,26/⟨0,1⟩,4,0⟩
  TT←⌈`× ⋄ IT←(↕≠)TT⊢ ⋄ I1T←(1+(↕≠))TT⊢

  # Comments and strings
  s←≠`dd←𝕩=1⊑sc
  f←s<(I1T s<𝕩=⊑sc)≤I1T𝕩=lf                 # Filter comments
  chr←@¨ci←/f∧𝕩=2⊑sc                        # Characters (indices ci)
  f↩f>qe←dd∧«sd←s∧dd                        # Quote Escape
  si←sd>»qe                                 # String indices
  str←𝕩⊔˜1-˜(+`si∾1)×(si<s)∾1               # Strings

  # Extract words: identifiers and numbers
  t←charSet⊐f/𝕩                             # Tokens
  r←t⊏charRole                              # Role
  l←(t≥⊑bN)∧t<⊑bW⋄w←l>»l                    # Word chars l, start w
  wi←(⊑bA)≤w/t                              # Type: 0 number, 1 identifier
  t↩t-na×l∧r=1                              # Case-insensitive
  n←l∧(+`w)⊏0∾¬wi                           # Number mask
  ide←(1-˜(l>n)×+`w>n)⊔t⊏charSet            # Identifiers

  # Numbers, at most 2 digits
  nt←((⊢∨«)0∾n)/0∾t×l                       # Number tokens separated by 0
  nn←nt=⊑bN⋄m←¬nn∨0=nt                      # Mask for ¯; digits
  nl←(0∾⟨π,∞⟩∾↕10)⊏˜m×nt-⊑bN                # Digit lookup
  ns←⟨1,¯1⟩⊏˜(m>»m)/»nn                     # Negate if ¯
  num←ns×(m>«m)/nl+10×»nl                   # Numeric values

  # Deduplicate literals and identifiers; other cleanup
  # Identifiers then literal tokens are numbered starting at vi
  ki←((⍒wi)⊏/w)∾(ci∾/si)⊏+`»f               # Indices in t
  k←⟨ide,⟨⟩,num,chr,str⟩                    # Constants
  k↩k/¨˜(⊢>¯1»⌈`)¨j←⊐¨k                     # IDs j into uniques k
  wr←w∨¬l∨t=⊑bW⋄r↩wr/r⋄c←≠t
  t↩wr/(c↑⍋(⊢+c×⊒)ki∾↕c)⊏(∾j++`vi»≠¨k)∾t    # Add IDs; remove words/whitespace
  t↩t-(t<+´bS)×(⊢×0≤⊢)t-⊑bS                 # Separators are equivalent
  pb←≠`1¨sb←¯1↓1↓/1(∾≠∾˜)t=sep              # Separator group boundaries (excludes leading and trailing)
  eb←⟨3,5⟩+⊑bB                              # End brackets that allow separators
  sk←sb/˜pb>(⊢∨«)eb∊˜pb+(sb-pb)⊏t           # Keep the first of each group that's not just inside a bracket
  sr←((≠t)↑/⁼sk)∨t≠sep⋄r↩sr/r⋄t↩sr/t        # Remove the rest
  𝕩↩t⋄nv←≠⊑k
  # End of tokenization!

  # Bracket roles
  # Open brackets initially have role ¯1 and closed ones have role 0
  gb←⍋+`p←(¯1-2×r)×(𝕩≥⊑bB)∧𝕩<+´bB           # Paren (actually any bracket type) depth and grade
  r↩r+𝕩=3+⊑bB                               # Assume blocks are functions
  cp←𝕩=1+⊑bB                                # Closed paren
  rp←(⍋gb)⊏(≠gb)»gb                         # Position of previous, for roles
  ir←((IT cp≤⊢)⊏⊢)(rp⊏0∾˜3=r)∨0<r           # Role of the expression ending at each position
  r↩r+cp×»ir                                # Roles at cp were 0; set them now

  # Reorder for parsing
  # Permutation to reverse each expression: *more* complicated than it looks
  rev←⍋+`¯1↓(¯1∾gb)((⍋⊣)⊏((⍋⊢)⊏⊣)¬⊏˜)⍋(+`+⊢)1∾gb⊏r=¯1
  gf←⍋fd←+`br←rev⊏p×𝕩∊⟨2,3⟩+⊑bB             # Order by brace depth fd to de-nest blocks
  rev↩gf⊏rev⋄fd↩gf⊏fd⋄br↩gf⊏br
  xv←rev⊏𝕩-vi                               # Save for lexical resolution
  g←⍋+`br-˜rev⊏p                            # Order by non-brace bracket depth
  gr←g⊏rev⋄gi←⍋g                            # Final parsing ordering
  b←br>0⋄bc←/br<0                           # Block Begin (mask) and Close (index), in matching order
  𝕩↩gr⊏𝕩⋄r↩gr⊏r⋄ir↩gr⊏ir

  # Parsing part 1
  a←¯3=r⋄ps←a<r<0                           # a: assignment, ps: part separator
  tr←ir⊏˜IT»ps                              # tr: train or modifier expression
  oa←⌽/op←r≥2⋄ro←op∨«op∧r=3                 # op: active modifiers; ro: mod or right operand
  xs←𝕩=sep⋄fo←𝕩=2+⊑bB                       # Separators, function open {
  ls←xs∧(IT fo)<IT lo←𝕩=4+⊑bB               # List Separators: after ⟨lo, not {fo
  ma←tr<(𝕩=1+⊑bG)∧«ir≥1                     # Modified assignment
  os←⌽(↕≠ro)(⊣-TT)⌽¬ro∨ma                   # Operator skip: distance rightward to derived function start
  at←1+(⊢+os⊏˜⊢)ai←/a                       # Assignment target
  ao←(⊑bG)-˜ai⊏𝕩+ma                         # Assignment opcode
  ak←gi⊏(«-⊢)(⍋⍷at∾↕≠𝕩)⊏(≠𝕩)↑1+ao           # Class of assignment: 1←? 2↩? 3+↩?
  aa←0<g⊏ac←»+`ak×1(»∨⊢)0=+`ak              # Broadcast ak to the entire target

  # Lexical resolution (independent of parsing part 2 below)
  id←/(0≤xv)∧xv<nv                          # Identifier indices in xv
  sp←/(¯3≤xv)∧xv<0                          # Special name indices
  d←1=id⊏ac                                 # Which accesses are definitions
  fi←+`b⋄fsc←3×fx←0∾1¨bc                    # Body index fi, immediacy ¬fx, special name count
  idf←id⊏fi⋄idv←id⊏xv                       # Function index and name ID
  dn←((df←d/idf)∾≠fx)⊔dv←d/idv              # Identifier name ID, per-block
  # Order every referenced identifier, and an undeclaration for each declaration
  ixf←(idf⊏¯1∾b/gf)∾df⊏(≠𝕩)∾1-˜bc⊏gf        # First order by block index, open for real and closed for virtual
  ig←((⍋(idv∾dv)⊏˜⊢)⊏⊢)⍋ixf                 # Then order by name
  ig↩(⊢/˜(≠d)>⊢)(⍋+`ig⊏d∾¯1¨dv)⊏ig          # Last order by declaration depth
  d↩ig⊏d⋄id↩ig⊏id
  ia←0<(id∾sp)⊏ac                           # Which are assignments
  idd←(⊢-(IT d)⊏⊢)id⊏fd                     # Identifier frame depth
  idi←((¯1+`d)⊏(⍋⍋d/ig)⊏⊢)(⊒+⊢⊏fsc˙)df      # Slot within frame
  spi←3+sp⊏xv                               # Special name index
  uu←(1«d)∧d((+`⊣)⊏1(∾/∾˜)(∨/⊣))0<idd       # Unused marker
  uu↩uu∾⌽∊⌽spi+6×sp⊏fi                      # ...for special names
  idor←∾3/⟨id∾sp⟩                           # Identifier bytecode ordering
  idoc←⟨32+uu(⊢+2×>)ia,idd∾0¨sp,idi∾spi⟩    # Identifier bytecode: instruction, depth, slot

  # Parsing part 2
  ta←tr∧2(>∨|)ps(⊢-TT)+`¬ro                 # Train argument (first-level)
  fa←/(ta∨ro∨(«∨⊢)ps<aa)<(r=1)∨»op          # Active functions
  dy←fa⊏«(tr∧r≥0)∨ro<r=0                    # Dyadic
  pr←𝕩⊏˜pi←/𝕩<sep⋄ob←pr⊏/¯1(⊢-»)u←⍷∧pr      # Objects to be loaded
  cn←pi∾lt←/𝕩≥cl←vi+nv⋄ob↩ob∾(cl-˜≠u)+lt⊏𝕩  # Constants
  bk←bc⊏gi                                  # Block loads
  lb←/𝕩=5+⊑bB                               # List starts
  ll←(¬lo/1«ps)+(⊢-»)1↓(lo∾1)/+`ls∾0        # List Length
  dr←/xs>ls⋄rt←/fo                          # Drop (block separator) and return
  fl←(⊢+dy×⊢⊏os˙)fa+dy                      # Function application site

  # Object code generation: numbers oc ordered by source location (after rev) oi
  or←⍋idor∾g⊏˜∾⟨cn,cn,bk,bk,2/lb,at,dr,oa+1⌈oa⊏os,fl,rt⟩
  oc←or⊏∾idoc∾⟨0¨cn,ob,1¨bk,1+↕≠bk,⥊⍉(11+lb⊏aa)≍ll,48+ao,6¨dr
               24+oa⊏r,16+dy+4×fa⊏tr,¯1↓rc←7¨fx⟩
  # Output
  fz←⟨0¨fx,¬fx,↕≠fx⟩                        # Per-function data
  cz←⟨/1∾or≥(≠oc)-≠rt,fsc+≠¨dn,dn,0¨¨dn⟩    # Per-body data
  ⟨oc∾¯1⊑rc,∾⟨u⊏𝕨⟩∾1↓k,<˘⍉>fz,<˘⍉>cz⟩       # Overall output
}