From 0c716e4c6b7c2c44bbfd02b6503cae66af7b7480 Mon Sep 17 00:00:00 2001
From: Marshall Lochbaum
Date: Fri, 28 Jan 2022 16:34:41 -0500
Subject: Separate syntax highlighting category for header/body characters ;:?

---
 docs/implementation/kclaims.html | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'docs/implementation/kclaims.html')

diff --git a/docs/implementation/kclaims.html b/docs/implementation/kclaims.html
index 26bbe7ca..d99d8ea4 100644
--- a/docs/implementation/kclaims.html
+++ b/docs/implementation/kclaims.html
@@ -41,35 +41,35 @@
 [Cycles where a code fetch is stalled due to L1 instruction cache miss]

That's just the whole cost (in cycles) of L1 misses, exactly what we want! First I'll run this on a J program I have lying around, building my old Honors thesis with JtoLaTeX.

-
 Performance counter stats for 'jlatex document.jtex nopdf':
+
 Performance counter stats for 'jlatex document.jtex nopdf':
 
-     1,457,284,402      cycles:u
-        56,485,452      icache_16b.ifdata_stall:u
-         2,254,192      cache-misses:u
-        37,849,426      L1-dcache-load-misses:u
-        28,797,332      L1-icache-load-misses:u
+     1,457,284,402      cycles:u
+        56,485,452      icache_16b.ifdata_stall:u
+         2,254,192      cache-misses:u
+        37,849,426      L1-dcache-load-misses:u
+        28,797,332      L1-icache-load-misses:u
 
        0.557255985 seconds time elapsed
 

Here's the BQN call that builds CBQN's object code sources:

-
 Performance counter stats for './genRuntime /home/marshall/BQN/':
+
 Performance counter stats for './genRuntime /home/marshall/BQN/':
 
-       241,224,322      cycles:u
-         5,452,372      icache_16b.ifdata_stall:u
-           829,146      cache-misses:u
-         6,954,143      L1-dcache-load-misses:u
-         1,291,804      L1-icache-load-misses:u
+       241,224,322      cycles:u
+         5,452,372      icache_16b.ifdata_stall:u
+           829,146      cache-misses:u
+         6,954,143      L1-dcache-load-misses:u
+         1,291,804      L1-icache-load-misses:u
 
        0.098228740 seconds time elapsed
 

And the Python-based font tool I use to build font samples for this site:

-
 Performance counter stats for 'pyftsubset […more stuff]':
+
 Performance counter stats for 'pyftsubset […more stuff]':
 
-       499,025,775      cycles:u
-        24,869,974      icache_16b.ifdata_stall:u
-         5,850,063      cache-misses:u
-        11,175,902      L1-dcache-load-misses:u
-        11,784,702      L1-icache-load-misses:u
+       499,025,775      cycles:u
+        24,869,974      icache_16b.ifdata_stall:u
+         5,850,063      cache-misses:u
+        11,175,902      L1-dcache-load-misses:u
+        11,784,702      L1-icache-load-misses:u
 
        0.215698059 seconds time elapsed
 
@@ -84,13 +84,13 @@

So, roughly 4%, 2%, and 5%. The cache miss counts are also broadly in line with these numbers. Note that full cache misses are pretty rare, so most misses just hit L2 or L3 and don't suffer a large penalty. Also note that instruction cache misses are mostly lower than data misses, as expected.

Don't get me wrong, I'd love to improve performance even by 2%. But it's not exactly world domination, is it? And no matter how cache-friendly K is, that's the absolute limit on what avoiding instruction cache misses could gain.

For comparison, here's ngn/k (which does aim for a small executable) running one of its unit tests—test 19 in the a20/ folder, chosen because it's the longest-running of those tests.

-
 Performance counter stats for '../k 19.k':
+
 Performance counter stats for '../k 19.k':
 
-     3,341,989,998      cycles:u
-        21,136,960      icache_16b.ifdata_stall:u
-           336,847      cache-misses:u
-        10,748,990      L1-dcache-load-misses:u
-        20,204,548      L1-icache-load-misses:u
+     3,341,989,998      cycles:u
+        21,136,960      icache_16b.ifdata_stall:u
+           336,847      cache-misses:u
+        10,748,990      L1-dcache-load-misses:u
+        20,204,548      L1-icache-load-misses:u
 
        1.245378356 seconds time elapsed
 
-- cgit v1.2.3