# The Markdown function is a markdown to html converter for a "good
# enough" subset of Github-flavored markdown, as specified at
# https://github.github.com/gfm/ .

# Extensions are used whenever a source filename is given (mainly just
# so testing won't use them). They:
# - Add id= slugs to headers that match Github's, for linking
# - Adjust relative links to account for filename changes
# - Highlight inline and block code as BQN
# - Place code blocks in <pre> tags only, not <pre><code>
# - Insert results into doubly-indented (8 spaces) code blocks
# - Add links to open and execute code in the REPL
# - Put a demo REPL in place of a marker HTML block, for the main README

# Supports:
# - ATX headings (start with hashes #)
# - Paragraphs
# - Indented code blocks
# - Inline and raw HTML in a way that doesn't match the spec at all
# - Tables
# - Lists, unordered with single-line items only
# - Inlines: code fully, links partially, and emphasis somewhat

# Important missing features:
# - Thematic breaks like *** or ---
# - Setext headings (underlined with ==== or ----)
# - Fenced code blocks (marked off with ``` or ~~~)
# - Block quotes (start with >)
# - Strikethrough (~~text~~)
# - Images (like links)
# - Hard line breaks (trailing spaces or backslash)

# Here, a markdown file is represented as a list of its lines, which are
# strings (they don't include any line ending character).
# The html file is constructed directly as a string, using Html.

################################
# Utilities

# The linefeed character (code point 10), used to terminate output lines
lf ← 10 + @

# Number of leading 1s in the boolean list 𝕩, found as the index of its
# first 0 (which is the length of 𝕩 if it contains no 0).
Lead ← { ⊑ 𝕩 ⊐ 0 }

# 𝕨 is a list of lists (groups). For each cell of 𝕩, return the index of
# the first group that contains it; a cell found in no group gets ≠𝕨.
FindGroup ← {
  ends ← +` ≠¨ 𝕨   # Position just past the end of each group in ∾𝕨
  flat ← ∾ 𝕨       # All groups run together
  # A cell's group number is the count of group ends at or before its
  # position in the flattened list.
  ends ⍋ flat ⊐ 𝕩
}

# For each position of the boolean list 𝕩, give the length of the run of
# true values ending there (0 where 𝕩 is false). This is the distance
# from the current 1-based position back to the most recent false one.
CountRuns ← {
  pos ← 1 + ↕≠𝕩           # 1-based positions
  lastFalse ← ⌈` pos × ¬𝕩  # Latest false position so far (0 if none yet)
  pos - lastFalse
}

# 𝕩 is a string. Return a mask marking escaped characters: those that
# directly follow a run of backslashes of odd length. An even-length
# run escapes nothing after it, as its backslashes escape each other.
IsEscaped ← {
  slash ← 𝕩 = '\'
  oddRun ← 2 | CountRuns slash  # Backslash ending an odd-length run
  » oddRun                      # The escaped character is the next one
}

# Strip the spaces at the beginning and at the end of string 𝕩
Trim ← {
  sp ← ' ' = 𝕩
  leading  ← ∧` sp       # Spaces before the first non-space
  trailing ← ⌽ ∧` ⌽ sp   # Same scan run from the other end
  (¬ leading ∨ trailing) / 𝕩
}

# For each position, report the value 𝕨 had at the most recent index
# (up to and including the current one) where 𝕩 was false. If 𝕩 has
# not been false yet, the result is 0.
PrecedesGroup ← {
  # Slot 0 of the lookup list stands for "before the start" and is
  # false; real positions are shifted up by one to make room for it.
  flags ← 0 ∾ 𝕨
  # Shifted positions, kept only where 𝕩 is false (others become 0)
  marked ← (¬𝕩) × 1 + ↕≠𝕩
  # Running maximum gives the latest false position seen so far
  (⌈` marked) ⊏ flags
}

# 𝕨 lists candidate expression start indices (in any order) and 𝕩 the
# matching end indices. Expressions exclude one another and never nest;
# scanning in index order, the first candidate is enabled, and after
# an enabled expression the next enabled one is the first candidate
# starting at or after its end index. The result is a shape ·‿2 array
# with one start‿end row per enabled expression, in index order.
Trace ← {
  # Spread: 𝕨 maps every node to a later node that is enabled whenever
  # it is; 𝕩 is the mask of nodes currently known to be enabled. The
  # last node is an artificial "stop" that comes after all expressions.
  # Every round composes 𝕨 with itself, doubling the distance covered
  # by one hop, so about 2⋆⁼≠𝕩 rounds are needed at most.
  Spread ← {
    on ← 1¨⌾((𝕩/𝕨)⊸⊏) 𝕩   # Enable successors of enabled nodes
    (⊏˜ 𝕨) Spread on       # Hop twice as far on the next round
  }⍟{0=¯1⊑𝕩}               # Run only while the stop is not enabled

  order ← ⍋𝕨               # Sort candidates by start index
  from ← order ⊏ 𝕨
  to   ← order ⊏ 𝕩
  # The successor of a candidate is the first one starting at or after
  # its end; the appended stop node (index ≠from) is its own successor.
  succ ← (from ⍋ to) ∾ ≠from
  # Begin with only the first node enabled, then discard the stop
  live ← ¯1 ↓ succ Spread (≠succ)↑1
  live / from ≍˘ to
}

# Concatenate the list of lines 𝕩 into one string, terminating every
# line (the last one included) with a linefeed.
JoinLines ← { ∾ {𝕩∾lf}¨ 𝕩 }

# 𝕩 is a flat list of begin-end pairs run together (begin, end, begin,
# end, …). Return the list of every value in those ranges, with both
# endpoints included, in the order the pairs appear.
Ranges ← {
  pairs ← ∘‿2 ⥊ 𝕩    # One begin‿end row per range
  lo ← ⊏˘ pairs      # Range starts
  hi ← 1 ⊏˘ pairs    # Range ends (inclusive)
  ∾ lo {𝕨 + ↕ 1+𝕩-𝕨}¨ hi
}
# Equivalent to ∊⟜Ranges, assuming 𝕨 is sorted: a mask telling which
# elements of 𝕨 fall in one of the inclusive begin-end ranges listed
# flat in 𝕩. The range boundaries are used as the left argument of ⍋,
# so they have to be in ascending order as well.
InRanges ← {
  # ⍋ treats intervals as half-open, so push each (inclusive) end up
  # by one; ends sit at the odd positions.
  bounds ← 𝕩 + 2 | ↕≠𝕩
  # Inside a range exactly when an odd number of boundaries was passed
  2 | bounds ⍋ 𝕨
}

# Create an html node from a tag name and interior text.
# 𝕨 (open) is the text of the opening tag without angle brackets; it
# may contain attributes after the tag name, separated by a space.
# 𝕩 (contents) is the string placed between the tags.
# The result is the string <open>contents</name>, where name is the
# part of open before its first space (all of open if it has none).
Html ← {open 𝕊 contents:
  # The closing tag uses the bare tag name only, with attributes dropped
  close ← (⊑open⊐" ") ↑ open
  ∾ ⟨"<",open,">" , contents , "</",close,">"⟩
}

# Delete elements from the list 𝕩 and splice new ones in. 𝕨 is the
# triple ⟨include,add,pos⟩:
# - include: mask of the elements of 𝕩 to keep
# - add: list of lists to insert
# - pos: for each list in add, the position in 𝕩 it follows
# Inserted elements land just after the given position, in the order
# they have in ∾add.
Modify ← { ⟨include,add,pos⟩𝕊𝕩:
  # Sort key of every output element: kept elements use their own
  # index, inserted ones repeat the position of the list they are in.
  keys ← (/include) ∾ (≠¨add) / pos
  vals ← (include/𝕩) ∾ ∾add
  # ⍋ is stable and kept elements come first, so on equal keys an
  # inserted element follows the kept element at that position.
  (⍋keys) ⊏ vals
}

# URL encoding for links to the REPL
# Convert the string 𝕩 to a list of numeric code points.
# Characters are first turned into numbers by subtracting @ (-⟜@).
# The block is then applied only when the literal "𝕩" has length 2,
# that is, when this character is stored as two units (presumably
# UTF-16 surrogate pairs); in that case each high/low surrogate pair is
# merged into the single code point it encodes.
UTF32 ← {
  p ← 2⋆10
  s ← p×52+2+↕3  # Surrogate character boundaries
  # c is 1 for a value in the first surrogate range (s[0] ≤ 𝕩 < s[1]),
  # 2 in the second (s[1] ≤ 𝕩 < s[2]), and 0 otherwise; h marks the
  # first kind.
  h ← 1 = c ← (≠s)|s⍋𝕩
  # Subtract the base of each value's range (0 for non-surrogates).
  # Each h position contributes p×(2⋆6)+offset, which » moves onto the
  # following element; the h positions themselves are then dropped.
  h ¬⊸/ (»h×p×(2⋆6)+⊢)⊸+ 𝕩 - c⊏0∾s
}⍟(2=≠"𝕩")∘-⟜@
# Convert the string 𝕩 to a flat list of byte values: the code points
# from UTF32 are each expanded to a list of numbers, and the lists are
# joined.
# The inner function is called with a limit 𝕨 (initially 2⋆7) and 𝕗
# equal to 2⋆6. A value below the limit becomes a single number;
# otherwise the remainder modulo 2⋆6 is split off as a trailing number
# and the quotient is processed recursively with limit (𝕨⌊2⋆6)÷2.
# 2⋆7 is added to every number at the end, and the first branch
# subtracts 2×𝕨-𝕗 beforehand, so a value under the initial limit comes
# out unchanged.
UTF8 ← ∾ (2⋆7) (⊣+(2⋆6){𝕨 ≤◶⟨⥊⊢-2×-⟜𝕗 ⋄ 𝕗(|∾˜(2÷˜⌊⟜𝕨)𝕊⌊∘÷˜)⊢⟩ 𝕩}¨) UTF32
# Encode the string 𝕩 as Base64 text computed from its UTF8 byte list.
# The result is a string over the alphabet A-Z a-z 0-9 + /, followed
# by (3|-l) '=' characters, where l is the number of bytes.
Base64 ← {
  # The 64 output characters, built from inclusive character ranges
  b64 ← Ranges "AZaz09++//"
  # u: bytes, l: their count, b: position of each byte in its group of 3
  b←3|↕l←≠u←UTF8 𝕩
  # M repeats the first element of every group of 3 (giving 4 slots per
  # group) and then multiplies by a cyclic 0‿1‿1‿1 mask, zeroing the
  # first slot of each group.
  M←((0<↕4)⥊˜≠)⊸× (1+0=b)⊸/
  # With d←4⋆1+b, each byte is split into a high part ⌊u÷d and a low
  # part (64÷d)×d|u. Both are laid out with M, and the high parts,
  # shifted one slot to the left, are added to the low parts.
  # NOTE(review): this presumably yields the 6-bit digit of each output
  # slot — confirm against a reference encoder.
  v←(4⋆1+b) ((⌊∘÷˜) «⊸+○M (64÷⊣)×|) u
  # Look up the digit characters and append the '=' padding
  (v⊏b64)∾(3|-l)⥊'='
}

# HTML and JS for demo REPL used in main README
repl ← 1↓"
""B Q N""
" loadRepl ← "" # Various URLs siteURL ← "https://mlochbaum.github.io/BQN/" tryURL ← siteURL∾"try.html#code=" repoURL ← "https://github.com/mlochbaum/BQN" blobURL ← repoURL∾"/blob/master/" # Environments _getCodeExec ← {𝕗⋄⍎} _getSvgExec ← {𝕗⋄⟨•path∾"svg.bqn","⍎"⟩ •EX •path∾"dzref"} ################################ Markdown ← {filename𝕊𝕩: extensions ← filename ≢ 0 path ← extensions◶""‿(⊢/˜·∨`⌾⌽'/'⊸=) filename CodeExec ← @_getCodeExec SvgExec ← @_getSvgExec ###### # First we classify each line based on the type of block it can start. ClassifyLine ← (0<≠)◶(0‿0)‿{ ind ← ⊑ lineChars FindGroup ⊏𝕩 getLen ← ind ⊑ lineClas∾⟨0˜⟩ l ← GetLen 𝕩 ⟨ind ∧ l>0 ⋄ l⟩ } # Character entity escaping # In order to use this with other modifications such as highlighting, # CharEntities returns a mask of characters to be escaped, and their # corresponding escapes. CharEntities ← {1¨⊸𝕊𝕩; # 𝕨 gives characters to potentially escape # The string gives escapes and their names, separated by spaces. # First split it on the first character. ce ← (1-˜¬×+`)∘=⟜⊑⊸⊔ " ""quot & gt" # Characters to escape are given first chars ← ⊑¨ce # HTML character entities start with & and end with ; entities ← ("&"∾∾⟜";")¨ 1↓¨ce # Replace a character if 𝕨 is not set and it's on our list. ind ← chars ⊐ 𝕩 useEntity ← 𝕨 ∧ ind < ≠chars ⟨¬ useEntity , entities ⊏˜ useEntity/ind , /useEntity⟩ } # Non-empty lines in code blocks have 4 leading spaces ProcCode ← { # Strip the leading spaces 𝕩 ↩ 4 ↓¨ 𝕩 code ← JoinLines 𝕩 # Highlight and unescape html-unsafe characters c‿ci ← extensions◶(2⥊<⟨⟩)‿GetHighlights code em‿e‿ei ← CharEntities code # If every line is indented by at least 4 additional spaces, we will # execute each one and insert the results. 
addRslt ← ∧´ ' ' = ∾ 4 (⌊⟜≠ ↑ ⊢)¨ 𝕩 # Don't show assignment results by default ShowRslt ← { depth ← +` "(){}⟨⟩" (⊣(≠⊸>ׯ1⋆2|⊢)⊐) 𝕩 𝕩 /˜↩ ¬ ∨`⌾⌽ (0=depth) ∧ 𝕩∊"⋄," # Just the last statement g ← 𝕩∊"←↩" (⊑g⊐1) (<⟜(≠g))◶⟨1,¬(" "∾∾idChars)∧´∘∊˜↑⟩ 𝕩 } r‿ri ← addRslt◶(2⥊<⟨⟩)‿{ ⟨ (ShowRslt ⊣◶⟨"",(⥊∾⟜lf⎉1)∘Fmt∘⊢⟩ CodeExec)⎊"ERROR"⍟(0<≠)¨ 𝕩 1 -˜ +` 1 + ≠¨ 𝕩 ⟩ # Don't forget the trailing newline } 𝕩 Link ← { c ← tryURL ∾ "&run" ∾˜ Base64 ¯1 ↓ JoinLines 4↓¨𝕩 ("a class=""replLink"" title=""Open in the REPL"" target=""_blank"" href="∾""""(∾∾⊣)c) Html "↗️" } mod ← ⟨em,e∾c∾r,ei∾ci∾ri⟩ Modify code 𝕩 Link⊸∾⍟addRslt "pre" Html "code" Html⍟(¬extensions) mod } # Headings start with #, and require 1-6 #s followed by a space. # Any trailing #s are ignored. LenHeading ← { n ← Lead 𝕩='#' l ← (0 » '\' = 𝕩 # Non-escaped bars 1 -˜ (¬r∨«b>r) × o + +` r } alignments ← (" align="""∾∾⟜"""")⍟(0<≠)¨ ""‿"right"‿"left"‿"center" ProcTable ← { rows ← (Trim¨ CutTableRow⊸⊔)¨ 𝕩 incl ← ¬ rule ← (∧´∾∊"-:"˙)¨ rows align ← alignments ⊏˜ (+˜⊸+´0‿¯1⊏⊢)¨ ':' = ⊑ rule / rows rows ↩ (((≠align)⌊≠)⊸↑ ProcInline¨)¨⌾(incl⊸/) rows rows ↩ (⊏rows) (⊢ ∾ ⟨""⟩ /˜ 0⌈-○≠)¨ rows rowType ← incl / +` rule # Head or body tags ← rowType ⊏ "th"‿"td" DoRow ← { lf ∾ JoinLines 𝕨 Html¨ 𝕩 } rows ↩ (<˘ tags ∾⌜ align) DoRow¨ incl/rows rowGroups ← ¯1 ↓ rowType ⊔○(∾⟜2) "tr"⊸Html¨ rows sections ← "thead"‿"tbody" Html⟜(lf ∾ JoinLines)¨ rowGroups "table" Html lf ∾ JoinLines (0 < ≠¨rowGroups) / sections } # Paragraphs ProcParagraph ← { "p" Html ProcInline ¯1 ↓ JoinLines Trim⌾(¯1⊸⊑) (Lead ' '⊸=)⊸↓¨ 𝕩 } # HTML blocks # Lazy rule: if it starts with < and contains >, it's probably HTML IsHtmlBlock ← (""⟩⊸≢◶GetRepl‿{ n←≠s←""⍷⊢)∘(⊑⟜𝕩) lastCommentEnd ← ¯1 comInd ← ∾ comments ← { lastCommentEnd ↩ end ← {𝕊⍟(¬EndsComment)1+𝕩}⍟(lastCommentEnd⊸<) 𝕩-1 𝕩 + ↕end¬𝕩 # A list of indices }¨ commentStart newBlock ← (≠𝕩)↑/⁼ ⊑¨ (0<≠¨)⊸/ comments lineType ↩ 5¨⌾(comInd⊸⊏) lineType lineDat ↩ 2¨⌾(comInd⊸⊏) lineDat # Lines that could be included in code blocks (will be 
refined) codeMask ← nonEmptyMask ∧ (lineType ≠ 5) ∧ blanks ≥ 4 paragraphMask ← 0 = lineType # A header can't have 4 spaces of indentation. If it doesn't become # part of a code block, it will be included in a paragraph. lineType -↩ codeMask ∧ 1 = lineType # Tables are made up of rows that would otherwise be paragraph rows. # They are indicated by the delimiter row, consisting of only a few # allowed characters, preceded (!) by a header row with the same # number of cells. IsTD ← (∧´ ∊ ∾ ⊣ ∊˜ 2↑⊢)⟜"-|: " tableMask ← (0⌾⊑ nonEmptyMask) ∧ paragraphMask ∧¬ codeMask tableDelimMask ← { 𝕩 IsTD¨∘⊣⌾(𝕨⊸/) 𝕨 }⟜𝕩 tableMask delimValid ← (⊢ =○(≠∘⊔∘CutTableRow¨ ⊏⟜𝕩) -⟜1) / tableDelimMask headerMask ← « delimValid⌾(tableDelimMask⊸/) 0¨𝕩 tableMask ↩ headerMask (⊢ ∧ ⊣ ∨ ⊣ PrecedesGroup <) tableMask lineType ↩ 3¨⌾(tableMask⊸/) lineType # Code blocks consist of indented lines, possibly with blank lines # in between. They must be separated from paragraphs by blank lines. codeMask ∧↩ ¬ paragraphMask PrecedesGroup codeMask codeMask ∨↩ codeMask (⊢ ∧ PrecedesGroup ∧ PrecedesGroup⌾⌽) lineType < 0 lineType ↩ 2¨⌾(codeMask⊸/) lineType # Lines continue blocks if they are part of the same multi-line # type as the previous line, and otherwise start new ones. # Headers (type 1) always start new blocks. newBlock ∨↩ 1 = lineType blockStart ← nonEmptyMask ∧ newBlock ∨ ¯1⊸»⊸≠ lineType # Headers and paragraphs ignore leading blanks. drop ← blanks × lineType < 2 # Group blocks based on blockStart, with type ¯1 lines excluded. blocks ← (1 -˜ (lineType ≥ 0) × +`blockStart) ⊔ drop ↓¨ 𝕩 # To process a block, pick the appropriate function from procFns. 
ProcBlock ← {t‿l G b: f←t⊑procFns ⋄ l F ⊑b } b ← (blockStart / lineType≍˘lineDat) <∘ProcBlock˘ blocks JoinLines b ∾ useRepl / ""‿loadRepl } ################################ # Testing # Uses the test cases at https://spec.commonmark.org/0.29/spec.json # since Github doesn't seem to have published theirs TestSections ← { tests ← ¯2 ↓˘ 8⊸(÷˜⟜≠∾⊣)⊸⥊2↓•LNS •path∾"spec.json" tests ↩ ((⊑2+⊐⟜':')¨∘⊏ ((-','=¯1⊑⊢)↓↓)¨⎉1 ⊢) tests testSection ← (1↓¯1↓⊢)¨ 5⊏˘tests UnEscape ← { EscapeChar ← { ("\""tn"⊐𝕩) ⊏ "\"""∾@+9‿10 } esc ← IsEscaped 𝕩 (¬«esc) / EscapeChar⌾(esc⊸/) 𝕩 } RunTest ← { in‿exp ← UnEscape∘(1↓¯1↓⊢)¨2↑𝕩 out ← 0 Markdown (@+10) ((⊢-˜¬×+`)∘=⊔⊢) in ⟨exp≡out,in,exp,out,2⊑𝕩⟩ } ignore ← (2 ⊏˘ tests) ∊ ⟨"47","85"⟩ res ← 1 ↓˘ (¬⊏˘)⊸/ RunTest˘ tests /˜ ignore < testSection ∊ 𝕩 res } ################################ # Syntax highlighting # Characters in identifiers. These are also used in ProcCode to detect # if a statement is an assignment. idChars ← ⟨ ('0'+↕10)∾"¯.π∞" "𝕣"∾˜'a'+↕26 'A'+↕26 "_" ⟩ # Return BQN highlights for an string 𝕩, as an ⟨add,pos⟩ list for Modify # (include will be all 1s). GetHighlights ← { # Characters used by BQN, and the HTML class they are associated with. classes‿chars ← <˘ ⍉ ∘‿2⥊⟨ 0 , " "∾@+9‿10 # Should never be highlighted "Value" , ¯1⊏˘5‿2⥊"𝕨𝕩𝕗𝕘𝕤"# Hack around UTF-16 "Function" , "+-×÷⋆√⌊⌈|¬∧∨<>≠=≤≥≡≢⊣⊢⥊∾≍↑↓↕«»⌽⍉/⍋⍒⊏⊑⊐⊒∊⍷⊔!"∾¯1⊏˘5‿2⥊"𝕎𝕏𝔽𝔾𝕊" "Modifier" , "˙˜˘¨⌜⁼´˝`" "Modifier2" , "∘○⊸⟜⌾⊘◶⎉⚇⍟⎊" "Number" , ∾idChars # Will be classified among ↑↑ later "Gets" , "←⇐↩→" "Paren" , "()" "Bracket" , "⟨⟩" "Brace" , "{}" "Ligature" , "‿" "Nothing" , "·" "Separator" , "⋄," "String" , "'""@" "Comment" , "#" ⟩ # Turn non-whitespace classes into ⟨open,close⟩ html tags. classTag ← ""‿"" ∾ > {⟨"",""⟩}¨ 1↓classes # Find each character's group, sending unknowns to 1. col ← (≠chars) (⊢--⟜1×=) chars FindGroup 𝕩 # Locate comments and strings. 
c ← 𝕩='#' le← /(𝕩=lf)∾1 # Line endings (le) end every comment (/c) on the line, so take a copy # for each # before that line but not the previous. ce← le /˜ -⟜» c/⊸⍋le # A single quote can only be used if there's another two places down. s ← /0‿0⊸«⊸∧𝕩=''' d ← /𝕩='"' css ← ⟨ s ⋄ ¯1↓d ⋄ /c ⟩ # Comment or string start cse ← ⟨ 2+s ⋄ 1↓d ⋄ ce ⟩ # Corresponding end indices # Now b is a table of (start,end) pairs b ← css Trace○∾ cse # Given a list of pairs, get a mask indicating included regions ToMask ← (≠`∨⊢) (≠𝕩)↑/⁼∘∾ # Split rows and group into text‿comments tc ← ((⊏˘b)⊏c) 2{𝕗↑⊔○(∾⟜𝕗)} <˘b # Color with "String" and "Comment" col ⌈↩ +´ (2‿1-˜≠classes) × ToMask¨ tc # UTF-16 hack: first half of a special name needs to match the second col↩ («col) ⊣⌾((𝕩=⊑"𝕩")⊸/) col # Color numeric literals and identifiers id ← col=5 # ←→ 𝕩∊idChars w ← »⊸< id # Word (identifier or number) beginning mask wt ← idChars FindGroup w/𝕩 # Type based on first character wt+↩ '_' = («⊸ (/start) {𝕨⊸+⌾(1⊸⊑)𝕩}⟜GetHighlights¨ ⊢) groups } ################################ # Format an array to a character matrix # Won't work on functions until we can catch errors fmtm ← ⟨•path∾"src/fmt.bqn"⟩ •EX •path∾"dzref" Fmt ← ⍕ _fmtm ################################ # Creating HTML files ConvertFile ← { MatchStart‿MatchEnd ← { ≤○≠◶0‿(⊣ ≡ (𝕩×≠)⊸↑) }¨ 1‿¯1 # Input file 𝕩 should be markdown; output file is fileout ! ".md" MatchEnd 𝕩 fileout ← ".html" ∾˜ (¯6⊸↓∾"index"˙)⍟("README"⊸MatchEnd) ¯3↓𝕩 # Contents of file to convert md ← •LNS •path∾𝕩 # Verify and remove the html link line: the output *is* the html file. ! 
("*View this file"⊸MatchStart ∧ (siteURL∾fileout∾").*")⊸MatchEnd) ⊑md out ← 𝕩 Markdown 2↓md parts ← (1-˜·(¬×1++`)'/'⊸=)⊸⊔ (⊑⊐⟜".")⊸↑ 𝕩 root ← ⊑ up ← ⥊∘/⟜≍⟜"../"¨ ⌽↕≠parts isInd ← "README" ≡ ¯1⊑parts RQ ← {'"'¨⌾(('''=𝕩)⊸/)𝕩} Link ← RQ {∾⟨""⟩} head ← "head" Html lf∾JoinLines " "⊸∾¨⟨ "shortcut icon' type='image/x-icon" Link "favicon.ico" "stylesheet" Link "style.css" "title" Html "BQN: " ∾⍟(¬·∨´"BQN"⍷⊢) 2↓⊑(2≤≠)◶0‿("# "≡2⊸↑)¨⊸/md ⟩ repo ← "a href='"‿repoURL‿"'" ∾⊸Html "BQN" crumbs ← up ("a href='"∾∾⟜"index.html'")⊸Html¨○((-isInd)⊸↓) (<"main")»parts nav ← RQ "div class='nav'" Html 3↓∾ " / "⊸∾¨ repo <⊸∾ crumbs front ← head ∾○(∾⟜lf) nav ("docs/"∾fileout) •LNS front ∾ out }