jank development update - Optimizing sequences
Jan 13, 2023 · Jeaye Wilkerson

In this episode of jank's development updates, we follow an exciting few weekends as I was digging deep into Clojure's sequence implementation, building jank's equivalent, and then benchmarking and profiling in a dizzying race to the bottom.

Introduction

Not expecting a rabbit hole, I was originally surprised at how many allocations are involved in a normal sequence iteration in Clojure and thought to optimize that in jank. In fact, Clojure allocates a new sequence for every element over which it iterates!

Clojure's interface for sequences looks like this (link):

public interface ISeq extends IPersistentCollection
 {
   /* Returns the current front element of the sequence. */
   Object first();
  /* ...the remaining members are elided here... */
}

And here is how APersistentVector.Seq, the sequence over a persistent vector, implements next():

public ISeq next()
{
   if(i + 1 < v.count())
   { return new APersistentVector.Seq(v, i + 1); }
   return null;
}

This really surprised me, and I figured there must be a lot of cases where a sequence is only referenced in one place, so it can be changed in place in order to avoid allocations. This could potentially save millions of allocations in a typical program. For example, with something like:

(apply str [1 2 3 4 5 6 7 8 9 10])

The exact APersistentVector.Seq from above will be used here, resulting in 10 allocations as apply iterates through the sequence to build the arguments for str. So I built something like that in jank's sequence API. It looks like this:

struct sequence : virtual object, seqable
 {
  using sequence_ptr = detail::box_type<sequence>;
 
  virtual object_ptr first() const = 0;
  virtual sequence_ptr next() const = 0;
   /* Each call to next() allocates a new sequence_ptr, since it's polymorphic. When iterating
    * over a large sequence, this can mean a _lot_ of allocations. However, if you own the
    * sequence_ptr you have, typically meaning it wasn't a parameter, then you can mutate it
   * in place. No allocations needed.
    *
    * If you don't own your sequence_ptr, you can call next() on it once, to get one you
    * do own, and then next_in_place() on that to your heart's content. */
  virtual sequence_ptr next_in_place() = 0;
 
   /* Note, no cons here, since that's not implemented yet. */
 };

The usage of next_in_place for all sequence traversals in jank meant that, at most, one allocation was needed for an iteration of any length. In jank's case, that meant the same (apply str [1 2 3 4 5 6 7 8 9 10]) went from 32 sequence allocations to only 3.
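
To make the ownership rule concrete, here's a minimal sketch of the intended pattern, using only the interface above (count_elements is a hypothetical helper, not part of jank):

size_t count_elements(sequence_ptr const &s)
{
  if(s == nullptr)
  { return 0; }
  /* We don't own s, so the first step calls next(), which allocates once.
   * Every step after that mutates our owned copy via next_in_place(). */
  size_t count{ 1 };
  for(auto it(s->next()); it != nullptr; it = it->next_in_place())
  { ++count; }
  return count;
}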

That's a huge win. Right?

The rabbit hole

So then I benchmarked. How long does jank take to apply that same vector of numbers to str? How much did I save?

jank

Note, this benchmark fn in jank is using nanobench. Since jank doesn't have working macros yet, the benchmark also includes invoking the function, which is not the case for Clojure.

(benchmark "apply"
  (apply str [1 2 3 4 5 6 7 8 9 10]))

Clojure (criterium):
 ;    Execution time lower quantile : 923.094673 ns ( 2.5%)
 ;    Execution time upper quantile : 987.172459 ns (97.5%)
 ;                    Overhead used : 14.193132 ns

Oh no. Clojure takes about 939 ns, while jank, even with the optimized interface, takes 6,191 ns. We're not even close!

Profile, change, benchmark, repeat

Firstly, let's compare the actual code being benchmarked here.

Generated code

Clojure

There is an excellent tool, which has proved useful so many times during jank's development, called clojure-goes-fast/clj-java-decompiler. With just the following:

user=> (require '[clj-java-decompiler.core :refer [decompile]])
user=> (decompile (apply str [1 2 3 4 5 6 7 8 9 10]))

We get:

public class cjd__init
 {
   public static final Var const__0;
   public static final Var const__1;
  public static final AFn const__2;

  public static Object load()
  {
    return ((IFn)const__0.getRawRoot()).invoke(const__1.getRawRoot(), const__2);
  }

  // More redacted here ...

  static
   {
     __init0();
     // A bit more redacted here ...
}

So, to understand this, note that our expression (apply str [1 2 3 4 5 6 7 8 9 10]) was turned into a Java class. The constants for apply and str, which are vars, were lifted, and our vector constant was also lifted. Those are the three const__ members of the class, which are statically initialized. The actual code which does our apply is in load. We can see, it basically does the following, if we sanitize the lifted constants:

apply.getRawRoot().invoke(str.getRawRoot(), vec);

Clojure's generated code seems optimal. The vars and vector are both lifted and the load function only gets their roots and invokes (roots can't reasonably be cached, especially during interactive programming, since vars can be redefined at any time, including from other threads). Let's see what jank is generating for this, to ensure it's equally optimized.

jank

struct gen166 : jank::runtime::object,
                jank::runtime::pool_item_base<gen166>,
                jank::runtime::behavior::callable,
                jank::runtime::behavior::metadatable {
   // Some bits redacted ...
 
  jank::runtime::context &__rt_ctx;
  jank::runtime::var_ptr const str155;
  jank::runtime::var_ptr const apply154;
  jank::runtime::object_ptr const const165;
  jank::runtime::object_ptr const const164;
  jank::runtime::object_ptr const const163;
  jank::runtime::object_ptr const const162;
  jank::runtime::object_ptr const const161;
  jank::runtime::object_ptr const const160;
  jank::runtime::object_ptr const const159;
  jank::runtime::object_ptr const const158;
  jank::runtime::object_ptr const const157;
  jank::runtime::object_ptr const const156;
 
  gen166(jank::runtime::context &__rt_ctx)
       : __rt_ctx{__rt_ctx},
         str155{__rt_ctx.intern_var("clojure.core", "str").expect_ok()},
         apply154{__rt_ctx.intern_var("clojure.core", "apply").expect_ok()},
        const165{jank::runtime::make_box<jank::runtime::obj::integer>(10)},
        const164{jank::runtime::make_box<jank::runtime::obj::integer>(9)},
        const163{jank::runtime::make_box<jank::runtime::obj::integer>(8)},
        const162{jank::runtime::make_box<jank::runtime::obj::integer>(7)},
        const161{jank::runtime::make_box<jank::runtime::obj::integer>(6)},
        const160{jank::runtime::make_box<jank::runtime::obj::integer>(5)},
        const159{jank::runtime::make_box<jank::runtime::obj::integer>(4)},
        const158{jank::runtime::make_box<jank::runtime::obj::integer>(3)},
        const157{jank::runtime::make_box<jank::runtime::obj::integer>(2)},
        const156{jank::runtime::make_box<jank::runtime::obj::integer>(1)}
   { }
 
  jank::runtime::object_ptr call() const override
   {
    using namespace jank;
    using namespace jank::runtime;
     object_ptr call167;
     {
      auto const &vec168(jank::runtime::make_box<jank::runtime::obj::vector>(
           const156, const157, const158, const159, const160, const161, const162,
           const163, const164, const165));
      call167 = jank::runtime::dynamic_call(apply154->get_root(), str155->get_root(), vec168);
     }
     return call167;
   }
};

The outline here is similar. jank generates a struct from the expression. We have constants lifted to members, and we initialize those in the struct's constructor. Then we have a call function which does our work. But, looking at our call function, we can see it's creating our vector, too; jank only lifted the numbers, not the whole vector! Let's change that.

The changes: 2a8014dfae6e57273983cee8f2c7f78a2be7fe73
ns/op      op/s         err%   ins/op      branch/op   miss%   total   benchmark
4,671.71   214,054.61   0.2%   23,798.02   6,721.00    0.3%    0.03    apply

Nice! We've gone from 6,191 ns to 4,671 ns by ensuring we lift the vector out. Our generated call function just looks like this now:

jank::runtime::object_ptr call() const override
 {
  using namespace jank;
  using namespace jank::runtime;
  object_ptr call169 = jank::runtime::dynamic_call
   (
     apply154->get_root(),
     str155->get_root(),
     const166
   );
   return call169;
}

Very similar to the generated Clojure load function! But still over 4x slower. We know the generated code is good, so let's dig deeper into what's going on when we call these functions.

Sequence lengths

If we follow how apply works on the C++ side, it looks like this:

object_ptr apply_to(object_ptr const &source, object_ptr const &args)
 {
   auto const &s(args->as_seqable()->seq());
  auto const length(detail::sequence_length(s, max_params + 1));
   switch(length)
   {
     case 0:
      return dynamic_call(source);
    case 1:
      return dynamic_call(source, s->first());
    case 2:
       return dynamic_call(source, s->first(), s->next_in_place()->first());
     // more redacted ...
   }
}

We need to know how many arguments we're calling the function with, by getting the sequence length, and then we build out the correct call accordingly. Clojure does the same thing here. Right now, detail::sequence_length is O(n), but our sequences know their length. Let's use that and add a Countable behavior to get an O(1) length check here. The new function looks like:

size_t sequence_length(behavior::sequence_ptr const &s, size_t const max)
 {
   if(s == nullptr)
   { return 0; }
  /* ...the rest of the walk is elided here... */
}
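
With the countable behavior in place, a sketch of how that O(1) path can look (as_countable() and count() are illustrative names, not necessarily jank's exact API):

size_t sequence_length(behavior::sequence_ptr const &s, size_t const max)
{
  if(s == nullptr)
  { return 0; }
  /* Countable sequences report their length directly: O(1). */
  if(auto const * const countable = s->as_countable())
  { return std::min(countable->count(), max); }
  /* Otherwise, walk the sequence, but never further than max. */
  size_t length{ 1 };
  for(auto it(s->next()); it != nullptr && length < max; it = it->next_in_place())
  { ++length; }
  return length;
}

The next excerpt, from jank's generated code for str, shows the accumulator approach it used to build the result string: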
              ret += seq->first()->to_string();
              for(auto it(seq->next_in_place()); it != nullptr; it = it->next_in_place())
              { ret += it->first()->to_string().data; }
             __value = make_box(ret);")

But that means we need to allocate a new std::string for every argument, then concatenate that into our accumulator, which likely requires yet another allocation. Let's use fmt's string building to do this all in place. That means jank's base runtime object expands its interface to have two to_string functions:

struct object : virtual pool_item_common_base
 {
   // redacted ...
 
  virtual detail::string_type to_string() const = 0;
  virtual void to_string(fmt::memory_buffer &buffer) const;
 
   // redacted ...
};

If we look at the implementation of this for integer, we can see a neat usage of FMT_COMPILE. This allows us to compile our format string ahead of time, leading to very efficient rendering at run-time.

void integer::to_string(fmt::memory_buffer &buff) const
{ fmt::format_to(std::back_inserter(buff), FMT_COMPILE("{}"), data); }
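
To see why this matters, here's a rough sketch of a two-argument str built on these overloads (jank's real version is generated code; this shape is illustrative):

object_ptr str(object_ptr const &a, object_ptr const &b)
{
  fmt::memory_buffer buff;
  /* Both arguments render into the same buffer; no temporary strings. */
  a->to_string(buff);
  b->to_string(buff);
  return make_box(std::string{ buff.data(), buff.size() });
}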

The changes: 819e1a178c3be549c894e9386e9dc54513800fe8
ns/op      op/s         err%   ins/op      branch/op   miss%   total   benchmark
2,375.56   420,952.59   0.3%   10,751.02   3,070.00    0.2%    0.01    apply

From 2,533 ns to 2,375 ns.

Further sequence interface optimizations

To kick things off, we added the next_in_place function to the sequence interface, but there are two things which can easily be identified by looking at the previous apply_to snippet:

object_ptr apply_to(object_ptr const &source, object_ptr const &args)
 {
   auto const &s(args->as_seqable()->seq());
  auto const length(detail::sequence_length(s, max_params + 1));
   switch(length)
   {
     // redacted some ...
  }
}

jank development update - Optimizing a ray tracer
Apr 07, 2023 · Jeaye Wilkerson

After the last post, which focused on optimizing jank's sequences, I wanted to get jank running a ray tracer I had previously written in Clojure. In this post, I document what was required to start ray tracing in jank and, more importantly, how I chased down the run time in a fierce battle with Clojure's performance.

Missing Clojure functions

Coming out of the last blog post, there were quite a few functions which the ray tracer required that jank did not yet have. A lot of this was tedium, but there are some interesting points.

Polymorphic arithmetic

In Clojure JVM, since everything can be an object, and Clojure's dynamically typed, we can't know what something like (+ a b) actually does. For example, it's possible that either a or b is not a number, but it's also possible that they're an unboxed long or double, or a boxed Long or a Double, or maybe even a BigInteger or Ratio. Each of these will handle + slightly differently. Clojure (and now jank) handles this using a neat polymorphic design. In jank, it starts with this number_ops interface:

struct number_ops
 {
   virtual number_ops const& combine(number_ops const&) const = 0;
   virtual number_ops const& with(integer_ops const&) const = 0;
   virtual number_ops const& with(real_ops const&) const = 0;
 
  virtual object_ptr add() const = 0;
  virtual object_ptr subtract() const = 0;
  virtual object_ptr multiply() const = 0;
  virtual object_ptr divide() const = 0;
  virtual object_ptr remainder() const = 0;
  virtual object_ptr inc() const = 0;
  virtual object_ptr dec() const = 0;
   /* ... and so on ... */
};

jank then has different implementations of this interface, like integer_ops and real_ops. The trick here is to use the correct "ops" for the combination of left and right. By left and right, I mean, when looking at the expression (+ a b), we see the left side, a, and the right side, b. So if they're both integers, we can use the integer_ops, which returns more integers. But if one is an integer and the other is a real, we need to return a real. You can see this in Clojure, since (+ 1 2) is 3, but (+ 1 2.0) is 3.0.

The way this comes together is something like this:

object_ptr add(object_ptr const l, object_ptr const r)
 { return with(left_ops(l), right_ops(r)).add(); }

You can see the Clojure source for this here and the jank source for this here.
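
The left_ops/right_ops double dispatch is the interesting part. Here's a simplified sketch of how the integer case promotes to real (the shared r_ops instance and the exact member names are illustrative, not jank's actual code):

struct integer_ops : number_ops
{
  /* First dispatch: the right side sees our concrete type. */
  number_ops const& combine(number_ops const &right) const override
  { return right.with(*this); }

  /* Second dispatch: int combined with int stays int... */
  number_ops const& with(integer_ops const&) const override
  { return *this; }
  /* ...but int combined with real widens the whole operation to real. */
  number_ops const& with(real_ops const&) const override
  { return r_ops; }

  /* add(), subtract(), and friends then run on whichever ops type won. */
};

number_ops const& with(number_ops const &left, number_ops const &right)
{ return left.combine(right); }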

Running the ray tracer

This ray tracer is a partial port of the very fun Ray tracing in one weekend project. It's not meant to be fast; it's a learning tool. However, it was a pure Clojure project I had lying around and seemed like a good next goal for jank. The scene is mostly random, but generally looks like this:

[image: an example rendered scene]

The source code for both the jank and the Clojure versions is in this gist. They vary only in the math functions used; each one uses its host interop for them. Note that this is not the most idiomatic Clojure; it was written to work with the limitations of a previous iteration of jank, then somewhat upgraded. jank still doesn't support some idiomatic things like using keywords as functions, so there are many calls to get.

After giving that code a tolerant scan, let's take a look at the initial numbers.

First timing results

I'm ray tracing a tiny image. It's only 10x6 pixels. However, in order to create this tiny image, we need to cast 265 rays. Here's the image:

[image: the rendered 10x6 scene]

Now, the initial timing for this (using nanobench) for jank is here:
ms/op    op/s   err%   ins/op             bra/op           miss%   total   benchmark
797.49   1.25   2.1%   4,864,568,466.00   873,372,774.00   1.3%    8.61    ray

Just shy of 800 milliseconds. It's less than a second, yes, but it's also a trivially tiny image. Let's see how Clojure does with the same code (using criterium).

; (out) Evaluation count : 12 in 6 samples of 2 calls.
 ; (out)              Execution time mean : 69.441424 ms
 ; (out)     Execution time std-deviation : 8.195639 ms
(defn vec3-scale [l n]
  {:r (* (get l :r) n)
    :g (* (get l :g) n)
    :b (* (get l :b) n)})

Let's see the jank code for this first. I've annotated it for readability.

/* Our function was turned into a struct which implements
    some jank interfaces. */
struct vec3_scale579 : jank::runtime::object,
                       jank::runtime::behavior::callable,
                       jank::runtime::behavior::metadatable {
  jank::runtime::context &__rt_ctx;
 
   /* Vars referenced within the fn are lifted to members.
      In this case, * and get. */
  jank::runtime::var_ptr const _STAR_595;
  jank::runtime::var_ptr const get596;
 
   /* Constants are lifted to members. */
  jank::runtime::object_ptr const const600;
  jank::runtime::object_ptr const const594;
  jank::runtime::object_ptr const const598;
 
   /* Constructor which initializes all lifted vars and constants. */
  vec3_scale579(jank::runtime::context &__rt_ctx)
       : __rt_ctx{__rt_ctx},
         _STAR_595{__rt_ctx.intern_var("clojure.core", "*").expect_ok()},
         get596{__rt_ctx.intern_var("clojure.core", "get").expect_ok()},
        /* ...the :r, :g, and :b keyword constants are interned here... */
   { }
 
   /* This is where the actual jank code we wrote runs. */
  jank::runtime::object_ptr call
   (
    jank::runtime::object_ptr l,
    jank::runtime::object_ptr n
   ) const override {
     /* First we call (* (get l :r) n). We can see the calls to get and * here. */
     object_ptr call607;
     {
       object_ptr call608;
      { call608 = jank::runtime::dynamic_call(get596->get_root(), l, const594); }
      call607 = jank::runtime::dynamic_call(_STAR_595->get_root(), call608, n);
     }
 
     /* Same thing for calling (* (get l :g) n). */
     object_ptr call609;
     {
       object_ptr call610;
      { call610 = jank::runtime::dynamic_call(get596->get_root(), l, const598); }
      call609 = jank::runtime::dynamic_call(_STAR_595->get_root(), call610, n);
     }
 
     /* Same thing for calling (* (get l :b) n). */
     object_ptr call611;
     {
       object_ptr call612;
      { call612 = jank::runtime::dynamic_call(get596->get_root(), l, const600); }
      call611 = jank::runtime::dynamic_call(_STAR_595->get_root(), call612, n);
     }
 
     /* Finally, we create a map from all of this and return it. */
     auto const map613
     (
      jank::make_box<jank::runtime::obj::map>
      (std::in_place, const594, call607, const598, call609, const600, call611)
     );
     return map613;
   }
 };

No big surprises there. Each var is dereferenced (with ->get_root()) when it's used, since vars can change at any time, from any thread. Let's see what Clojure generates for the same function.

/* Just like in jank, a class was generated for this function. */
public final class core$vec3_scale extends AFunction
 {
     /* Just like in jank, the constants were lifted to be members. */
     public static final Keyword const__0;
    /* ... more redacted here ... */
}

Where Clojure really pulls ahead is in unboxed math. Consider this reflectance function:

(defn reflectance [cosine ref-idx]
  (let [r (/ (- 1.0 ref-idx)
              (+ 1.0 ref-idx))
         r2 (* r r)]
     (* (+ r2 (- 1.0 r2))
       (Math/pow (- 1.0 cosine) 5.0))))

Because (- 1.0 ref-idx) contains a double, Clojure can know that the whole expression will return a double. By then tracking how r is used, we can see if it requires boxing at all. In this case, r is only used with *, which doesn't require boxing. So r can actually just be a double instead of a Double. The same applies for everything else. Take a look at the generated code.

public final class core$reflectance extends AFunction
 {
   public static Object invokeStatic(final Object cosine, final Object ref_idx)
   {
@@ -132,26 +132,26 @@
     final double r2 = r * r;
     return Numbers.multiply(r2 + (1.0 - r2), Math.pow(Numbers.minus(1.0, cosine), 5.0));
   }
}

One last point about this: you may note that we're not boxing the double we're returning. The call to Numbers.multiply with two double inputs returns a double, but our function returns an Object. This works because Java supports auto-boxing and unboxing. In short, it will allow you to implicitly treat boxed and unboxed objects the same, injecting the necessary code when it compiles. So, don't be fooled, the final return value here is boxed.

To do a similar thing in jank, I broke it into some steps:

Generate more type info

Changes: 3e18c1025f3a6db2028d55f819594208197e1a78

The goal here is to use boxed types (integer_ptr, keyword_ptr, etc) during codegen, whenever we have them, rather than just object_ptr everywhere. Also, whenever possible, use auto to allow the propagation of richer types.

Add boxed typed overloads for math fns

Changes: 083f08374dd371dacf6b854decbada83d336fbd6

Even without unboxing, if we can know we're adding a real_ptr and a real_ptr, for example, we can skip the polymorphic dance and get right at their internal data. We don't do that yet, here, but we add the right overloads.

Extend the codegen to convey, for any expression, whether a box is needed

Changes: 626680ecdb09d9007378e5f17bd06f8bcfd285ff

This sets the stage for unboxed math, if conditions, let bindings, etc.

Remove polymorphism from boxed math ops, when possible

Changes: 87715f9a711e0e0bd667afce221ccbe4a85b5501

This utilizes the typed overloads to optimize the calls where we have a typed box like integer_ptr or real_ptr.

Add unboxed math overloads

Changes: d6d97e9a58f6a40dcc425b8da80c9bd69faa0886

Wrapping everything together, this allows expressions like (/ 1.0 (+ n 0.5)) to avoid boxing, at least for the (+ n 0.5). Conditions for if don't require boxing, so something like (if (< y (/ 1.0 (+ n 0.5))) ...) wouldn't box at all.
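
As a rough illustration, with those changes the codegen for (/ 1.0 (+ n 0.5)), when n is known to be a boxed real, can come out closer to this (type and member names are illustrative):

jank::runtime::object_ptr call(jank::runtime::obj::real_ptr const n) const
{
  /* (+ n 0.5) stays unboxed; data is the raw double inside the box. */
  auto const sum(n->data + 0.5);
  /* Only the final result needs a box. */
  return jank::runtime::make_box<jank::runtime::obj::real>(1.0 / sum);
}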

I didn't implement unboxed let bindings, since that requires tracking binding usages to know if each one requires a box and I'm lazy. Still, let's see what we won.
ms/op    op/s   err%   ins/op             bra/op           miss%   total   benchmark
312.78   3.20   3.4%   1,817,289,100.00   350,618,740.00   1.4%    3.39    ray

Nice! That was a lot of work, but it gets us down from 559.31 ms to 312.78 ms.

Profile results

At this point, inspecting the generated code wasn't showing any clear opportunities, so I profiled again and looked for the primary culprits. They were:

  • get
  • mul, add, sub
  • map constructor

Hm, strikingly similar to the expectations set when we started. Sometimes it's unfortunate not to find surprises. Still, let's see what we can do about those maps.

Faster map

jank's array map implementation was using a vector of pairs. Note that both jank and Clojure distinguish between array maps and hash maps. Array maps are a specialization for short maps (i.e. few keys) which don't have the overhead of trees. Rather than O(log32 n) access with some constant overhead, they have O(n) access with very little overhead. When n is small enough, it ends up being faster. Once array maps get too big, they convert automatically into hash maps. For this ray tracer, all the maps are very small, so we're only looking at the array map implementation. jank's array map looked like this:

template <typename K, typename V>
struct map_type_impl
 {
   /* Storing a vector of key/value pairs. */
  using value_type = native_vector<std::pair<K, V>>;
  using iterator = typename value_type::iterator;
  using const_iterator = typename value_type::const_iterator;
 
   map_type_impl() = default;
  map_type_impl(map_type_impl const &s) = default;
  map_type_impl(map_type_impl &&s) noexcept = default;
  map_type_impl(in_place_unique, value_type &&kvs)
    : data{ std::move(kvs) }
   { }
   ~map_type_impl() = default;
 
   /* ... insert fns ... */
 
   /* Note the linear search. */
  V find(K const &key) const
   {
     if(auto const kw = key->as_keyword())
     {
@@ -176,23 +176,23 @@
 
   value_type data;
   mutable size_t hash{};
};

This is using folly's vector, a more optimized version of std::vector. Turns out it's still very slow to create, compared to just an array. I ended up benchmarking both vector types, std::array, and finally C arrays, each with pairs and without. C arrays, without pairs, clearly won. Since there are no pairs, keys and values are interleaved. Turns out this is exactly what Clojure does in PersistentArrayMap.java.

The new array map is quite similar:

template <typename KV>
struct map_type_impl
 {
   /* Just a C array and a length. */
  using value_type = KV*;
 
   map_type_impl() = default;
  map_type_impl(map_type_impl const &s) = default;
  map_type_impl(map_type_impl &&s) noexcept = default;
  map_type_impl(in_place_unique, value_type const kvs, size_t const length)
     : data{ kvs }, length{ length }
   { }
   ~map_type_impl() = default;
 
   /* ... insert fns ... */
 
  KV find(KV const key) const
   {
     if(auto const kw = key->as_keyword())
     {
@@ -215,26 +215,26 @@
   }
 
   /* Custom iteration is needed, due to the interleaving. */
  struct iterator
   {
    using iterator_category = std::input_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = std::pair<KV, KV>;
    using pointer = value_type*;
    using reference = value_type&;
 
    value_type operator *() const
     { return { data[index], data[index + 1] }; }
    iterator& operator ++()
     {
       index += 2;
       return *this;
     }
    bool operator !=(iterator const &rhs) const
     { return data != rhs.data || index != rhs.index; }
    bool operator ==(iterator const &rhs) const
     { return !(*this != rhs); }
    iterator& operator=(iterator const &rhs)
     {
       if(this == &rhs)
       { return *this; }
@@ -247,7 +247,7 @@
     KV const* data{};
     size_t index{};
   };
  using const_iterator = iterator;
 
   value_type data{};
   size_t length{};
};
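
The find shown above was truncated, so here's a sketch of the full linear probe over the interleaved layout (simplified to the keyword fast path; keywords are interned, so identity comparison suffices):

KV find(KV const key) const
{
  if(key->as_keyword())
  {
    for(size_t i{}; i < length; i += 2)
    {
      /* Keys sit at even indices, values at odd ones. */
      if(data[i] == key)
      { return data[i + 1]; }
    }
  }
  /* Fall back to full equality checks for non-keyword keys (elided). */
  return nullptr;
}
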
jank development update - A faster object model
Jul 08, 2023 · Jeaye Wilkerson

This quarter, my work on jank is being sponsored by Clojurists Together. The terms of the work are to research a new object model for jank, with the goal of making jank code faster across the board. This is a half-way report and I'm excited to share my results!

The problem

Before getting into any solutions, or celebrating any wins, we need to talk about why this work is being done at all. As you can see in my previous development updates, jank is fast. It can beat Clojure in each benchmark I've published so far. However, some parts of jank's runtime are still quite slow and, unfortunately, the problem is systemic.

Generally speaking, the problem can be boiled down to this: the JVM is ridiculously fast at allocations. That's important, since Clojure is, to put it nicely, very liberal with its allocations. jank overcomes this, in some ways, by just allocating a whole lot less. Still, each allocation pushes Clojure ahead in benchmarks and it adds up.

So, JVM allocations are fast, but why are jank's slow? To understand this requires an understanding of C++ inheritance and virtual function tables (vtables), so let's cover that at an implementation level.

Virtual function tables

Clojure is thoroughly polymorphic. Everything is an Object, which can then have any number of interfaces it implements, all of which can be extended, checked at run-time, etc. To accomplish this, in C++, I modeled the objects quite closely to how they are in Clojure's Java runtime. Let's take a look.

Let's take a stripped down jank base object:

struct jank_object : gc
{ virtual std::string to_native_string() const = 0; };

Now let's define our boxed string object:

struct jank_string : jank_object
 {
  std::string to_native_string() const override
   { return data; }
 
  std::string data{};
};

This is how each object is modeled in jank, currently, and it's mostly the same as how they are in Java. Each boxed string, hash map, vector, integer, etc inherits from a base object and overrides some functionality. We can use g++ -fdump-lang-class foo.cpp to put these sample types into a file and see the generated class hierarchy details. The output of that is long and confusing, though, so I've turned them into simpler diagrams. Let's take a look at jank_string.

So, jank_string is 40 bytes (8 for the jank_object vtable pointer + 32 for the std::string). It has its own static vtable and a vptr to it, since it inherits from jank_object and overrides a function. Whenever a jank_string is allocated, these vtable pointers need to be initialized. All of this is handled by the C++ compiler, and is implementation-defined, so we don't have much control over it.
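
We can sanity check those numbers directly. On a typical 64-bit libstdc++ build, something like this holds (the exact sizes are implementation-defined, so treat the values as illustrative):

static_assert(sizeof(std::string) == 32, "libstdc++'s SSO string");
static_assert(sizeof(jank_string) == sizeof(void*) + sizeof(std::string),
              "one vtable pointer plus the string payload");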

Let's take this a step further and add another behavior, since I need to be able to get the size of a jank_string.

struct jank_countable
 { virtual size_t count() const = 0; };
 
struct jank_string : jank_object, jank_countable
 {
  std::string to_native_string() const override
   { return data; }
 
   size_t count() const override
   { return data.size(); }
 
  std::string data{};
};

So now we add jank_countable into the mix and implement that for jank_string. What has this done to our vtables? Well, jank_countable needs its own vtable and jank_string is going to need a pointer to it.

Notice that jank_string was 40 bytes, but now it's 48 bytes, due to the additional pointer to the jank_countable vtable. It's important to note here that we didn't just make every string we allocate larger, which may slow down allocations; we also added another field to be initialized, which will certainly slow down allocations.

I'm sure you get the point, so let me wrap this section up by noting that Clojure's object model involves a lot of behaviors. Here's what jank's map object looks like right now:

struct map
  : object,
    behavior::seqable,
    behavior::countable,
    behavior::metadatable,
    behavior::associatively_readable,
    behavior::associatively_writable
{ /* ... */ };

That's six vtable pointers and it covers maybe half of the functionality which Clojure's maps have. I just haven't implemented the rest yet. As I do, jank's maps will become slower and slower to allocate.

Garbage collectors

Before going further, I need to note that all of my Clojure benchmarking has been done on my local Linux desktop running OpenJDK 11 with the G1 GC. jank is currently using the Boehm GC, which is a conservative, non-moving GC that's super easy to use, but not the fastest on the market. More on this later, but note that jank has a lot of room to grow in terms of allocation speed by using a more tailored GC integration.
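
For reference, the gc base type these examples inherit from is conceptually along these lines (Boehm ships a similar class in gc_cpp.h; this is a trimmed sketch):

#include <cstddef>
#include <gc/gc.h>

struct gc
{
  /* Route allocations through the collector... */
  void* operator new(std::size_t const size)
  { return GC_MALLOC(size); }
  /* ...which also reclaims them; explicit delete is a no-op. */
  void operator delete(void*) noexcept
  { }
};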

Initial numbers

By benchmarking the creation of non-empty hash maps ({:a :b} specifically), we can paint a pretty clear picture of the issue I've been describing.

For Clojure, it takes about 16ns to allocate. For jank, that number is nearly doubled to 31ns. So what can be done? Clojure depends on this level of polymorphism, and virtual functions are how you accomplish this in C++, so what else can we even do?

Static runtimes

Let's consider how a completely static runtime might be implemented. For example, let's assume I had a simple language which only supported a few object types, with no syntax for defining new types or protocols or even extending existing ones. This would often be implemented using something like a tagged union in C-like languages. Here's a quick example:

enum class object_type
 {
   nil,
   string,
   integer
 };
 
using nil_t = struct { };
using string_t = char const *;
using integer_t = long;
 
struct object
 {
   /* Each object has a "tag", which is generally an enum. */
   object_type type;
  /* All payloads overlay the same storage; the tag above says which is live. */
  union
  {
    nil_t nil;
    string_t string;
    integer_t integer;
   };
 };
 
void print(object const &o)
 {
   switch(o.type)
   {
    case object_type::nil:
      fmt::print("nil");
       break;
    case object_type::string:
      fmt::print("{}", o.string);
       break;
    case object_type::integer:
      fmt::print("{}", o.integer);
       break;
   }
}

So, if you're not familiar with how unions work, they just store all of the possible fields listed in the union in the same memory space. The union is as big as its largest field. The tag accompanies the union and informs you how to treat that memory (i.e. as an integer, string, etc). In order to access data from the union, we generally just use a switch statement on the tag.

The main drawback with this approach is that all possible types need to be known at compile-time, since they're part of the enum, the union, and each switch statement. However, the main benefit of this approach is the same. All types are known at compile-time, so compilers have everything they need to optimize access. There are no vtables, object allocations are all the same size, each function call can potentially be inlined, and so on.
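
Construction is just as direct; a small helper in the same style (make_integer is hypothetical, matching the example above):

object make_integer(integer_t const i)
{
  object o;
  o.type = object_type::integer;
  o.integer = i;
  return o;
}

Calling print(make_integer(42)) then takes the integer branch of the switch and prints 42.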

A hybrid runtime

Clojure demands polymorphism, but it also has a well known set of static types. In fact, we model most of our programs just using Clojure's built-in data structures, so why not optimize for that case? The entirely open, polymorphic case doesn't need to negatively impact the average case.

This reasoning led me to prototyping and benchmarking a tagged object model for jank. However, since jank is not a trivial language, the tagged implementation couldn't quite be as simple as my example above. There are a few key concerns.

Concern 1: Unions

Unions are very limiting. Even with jank's static objects, there is a large variety in object size. Requiring every integer, for example, to be as big as a hash map is not ideal. Numbers need to be fast to allocate and use.

Fortunately, C++ offers a great deal more power than C when it comes to compile-time polymorphism, in the form of templates, so we can take advantage of that. Let's see what that looks like:

enum class object_type
+}

So, if you're not familiar how unions work, they just store all of the possible fields listed in the union in the same memory space. The union is as big as its largest field. The tag accompanies the union and informs you how to treat that memory (i.e. as a integer, string, etc). In order to access data from the union, we generally just use a switch statement on the tag.

The main drawback with this approach is that all possible types need to be known at compile-time, since they're part of the enum, the union, and each switch statement. However, the main benefit of this approach is the same. All types are known at compile-time, so compilers have everything they need to optimize access. There are no vtables, object allocations are all the same size, each function call can potentially be inlined, and so on.

A hybrid runtime

Clojure demands polymorphism, but it also has a well known set of static types. In fact, we model most of our programs just using Clojure's built-in data structures, so why not optimize for that case? The entirely open, polymorphic case doesn't need to negatively impact the average case.

This reasoning led me to prototype and benchmark a tagged object model for jank. However, since jank is not a trivial language, the tagged implementation couldn't be quite as simple as my example above. There are a few key concerns.

Concern 1: Unions

Unions are very limiting. Even with jank's static objects, there is a large variety in object size. Requiring every integer, for example, to be as big as a hash map is not ideal. Numbers need to be fast to allocate and use.

Fortunately, C++ offers a great deal more power than C when it comes to compile-time polymorphism, in the form of templates, so we can take advantage of that. Let's see what that looks like:

enum class object_type
 {
   nil,
   integer
 };
 
-template <object_type T>
-struct static_object;
+template <object_type T>
+struct static_object;
 
 template <>
-struct static_object<object_type::nil> : gc
+struct static_object<object_type::nil> : gc
 { };
 
 template <>
-struct static_object<object_type::integer> : gc
-{ native_integer data{}; };

+struct static_object<object_type::integer> : gc
+{ native_integer data{}; };

Ok, let me break this down. We start with the same enum as with the static runtime example. Here I'm just showing nil and integer. Then, we have a new static_object struct template. It's parameterized on the object type. Note that templates can be parameterized on types as well as certain values. Here we're parameterizing on the enum value itself. We can specialize this template for each value of object_type and each one can be a completely distinct struct, with its own fields. However, they're all tied together by the combination of static_object and some enum value. This usage of templates is kind of like Clojure's multi-methods, but for compile-time types.

This is much more flexible than the union approach, since each object type has its own definition and size. The size of the integer specialization will be far smaller than the size of the map specialization.
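
As a usage sketch (assuming the gc base class provides a GC-backed operator new, as Boehm's gc_cpp.h does; the function itself is hypothetical):

void example()
{
  /* Each allocation is exactly the size of its concrete object;
     there's no union forcing every object to the largest size. */
  auto const i(new static_object<object_type::integer>{});
  i->data = 42;

  auto const n(new static_object<object_type::nil>{});
  static_assert(sizeof(*i) >= sizeof(*n));
}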

However, the work isn't done yet.

Concerns 2 and 3: Type erasure and stable pointers

With the above static_object template, we can allocate an integer and it has its own strong, static type. However, to achieve Clojure's polymorphism, we need type erasure. For example, we need to be able to store any type of object in a vector, or as a key in a map. When using inheritance, we have a base object type for that. When using the union based approach, every object fits inside of a single object type. However, in our type-rich object model, each object type is discrete. We need a common way to refer to them, while still being able to get back to the static object. On top of that, we need a way to unerase the type, allowing us to get back to the original static object. This is Concern 2.

Also, Concern 3 is that the pointers we use to hang onto these objects need to be stable and they need to correspond with the pointers the GC gave us when we allocated them. This is because the GC is constantly scanning the process memory for references to those pointers; if we type-erase to some other pointer value and hang onto that, the GC may suspect nobody is referencing the original value anymore and take the liberty of freeing it.

We can solve both of these problems with the same addition: a simple object type which contains our object_type enum. If every static_object specialization has this object type as its first member, we can ensure that a pointer to the object member is the same value as a pointer to the static_object itself (and we can static_assert this to ensure padding doesn't bite us). With that knowledge, we can reinterpret any object pointer to be a static_object pointer, based on doing a switch on the object type. Here's how it would look:

enum class object_type
 {
   nil,
   integer
 };
 
 /* An object type which contain the enum value. */
-struct object
+struct object
 { object_type type{}; };
-using object_ptr = object*;
+using object_ptr = object*;
 
-template <object_type T>
-struct static_object;
+template <object_type T>
+struct static_object;
 
 /* Each specialization composes the object type as its first member. */
 template <>
-struct static_object<object_type::nil> : gc
-{ object base{ object_type::nil }; };
+struct static_object<object_type::nil> : gc
+{ object base{ object_type::nil }; };
 
 template <>
-struct static_object<object_type::integer> : gc
+struct static_object<object_type::integer> : gc
 {
-  object base{ object_type::integer };
+  object base{ object_type::integer };
   native_integer data{};
 };
 
-void print(object const &o)
+void print(object const &o)
 {
   switch(o.type)
   {
-    case object_type::nil:
-      fmt::print("nil");
+    case object_type::nil:
+      fmt::print("nil");
       break;
-    case object_type::integer:
+    case object_type::integer:
       /* We can cast right from the object pointer to the static_object pointer. */
-      auto const typed_o(reinterpret_cast<static_object<object_type::integer> const*>(&o));
-      fmt::print("{}", typed_o->data);
+      auto const typed_o(reinterpret_cast<static_object<object_type::integer> const*>(&o));
+      fmt::print("{}", typed_o->data);
       break;
   }
-}

+}

This is the classic composition versus inheritance change. The previous version of jank's object model followed Clojure JVM's design of using inheritance. This new design uses composition, by having each static object have the base object as its first member.
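
That static_assert mentioned earlier could look something like this (a sketch, using offsetof from <cstddef>; jank's actual check may differ):

/* The base object must live at offset zero, so a pointer to the
   static_object and a pointer to its base member share an address. */
static_assert
(
  offsetof(static_object<object_type::integer>, base) == 0,
  "the object member must come first, with no padding before it"
);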

Concern 4: Switch statements

Imagine if we had to write a switch statement everywhere we wanted polymorphism. In a simpler language that uses the classic tagged union approach, especially when written in C, this would typically just be the way things work. However, surely modern C++ has some more robust features for us to use instead? Indeed it does.

We can get around this duplication by having the switch in only one place and using the visitor pattern to access it. The result looks like this:

template <typename F>
+[[gnu::always_inline, gnu::flatten, gnu::hot]]
+inline void visit_object(object * const erased, F &&fn)
 {
   switch(erased->type)
   {
-    case object_type::nil:
+    case object_type::nil:
       fn(reinterpret_cast<static_nil*>(erased));
       break;
-    case object_type::integer:
+    case object_type::integer:
       fn(reinterpret_cast<static_integer*>(erased));
       break;
   }
 }
 
-void print(object const &o)
+void print(object const &o)
 {
   visit_object
   (
@@ -140,42 +140,42 @@
     /* Generic anonymous function. */
     [](auto const typed_o)
     {
-      using T = std::decay_t<decltype(typed_o)>;
+      using T = std::decay_t<decltype(typed_o)>;
 
-      if constexpr(std::same_as<T, static_nil*>)
-      { fmt::print("nil"); }
-      else if constexpr(std::same_as<T, static_integer*>)
-      { fmt::print("{}", typed_o->data); }
+      if constexpr(std::same_as<T, static_nil*>)
+      { fmt::print("nil"); }
+      else if constexpr(std::same_as<T, static_integer*>)
+      { fmt::print("{}", typed_o->data); }
     }
   );
-}

+}

The visitor pattern here allows us to specify a generic lambda, which is basically shorthand for a function template that accepts any input. The anonymous function will be called with the fully typed static_object, and we can use compile-time branching based on the type of the parameter to do the things we want. This means the most optimal code is generated and there's static type checking every step of the way, even in our polymorphic system.

The annotations above visit_object instruct the compiler to optimize all of this away. As I will show in just a bit, this is no challenge at all. The visitor pattern is not at all present in the generated binary.

I know that the if constexpr branching didn't save us any lines, compared to the switch, in the previous example. Hang tight while we address that.

Concern 5: Polymorphic behaviors

Finally, we hit our last concern. Objects in Clojure are polymorphic, but they can also be referred to by their own polymorphic behaviors. For example, in jank, we have behaviors for countable (for use with count), associatively_readable (which supplies access to get), etc. These aren't objects on their own; they're behaviors for objects. In typical OOP terms, they're interfaces which these objects implement. In a world with static objects and compile-time branching to visit them, how do we handle these behaviors?

Well, C++20 introduces an improved take on the idea of compile-time behaviors in what it calls concepts. So, let's define a concept for getting a string from an object. I like to end all of these behaviors with able, even when it doesn't grammatically work at all, as a cheeky jab at OOP.

template <typename T>
 concept stringable = requires(T * const t)
 {
-  { t->to_string() } -> std::convertible_to<native_string>;
-};

+  { t->to_string() } -> std::convertible_to<native_string>;
+};

C++20 concepts are just compile-time predicates, but they're quite flexible. This is a predicate for some type T that checks if you can call ->to_string() on an instance of it and get something compatible with a native_string. This is less specific than a C++ interface which says you need to implement something like virtual native_string to_string() const, since it allows returning references to strings, or something which can convert to a string.

Keep in mind that, while inheritance is intrusive, concepts are not. They're just predicates for types and are not coupled to any given type. This is analogous to the structural typing versus nominal typing discussion.
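
To see that non-intrusiveness in action, here's a hypothetical type (not part of jank, and assuming native_string is constructible from a string literal) which satisfies stringable without ever naming it:

struct my_keyword
{
  /* No base class and no mention of stringable; the structural
     check in the concept is all that matters. */
  native_string to_string() const
  { return ":my-keyword"; }
};

static_assert(stringable<my_keyword>);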

If we wanted to use this in our print function, we could just do:

void print(object const &o)
 {
   visit_object
   (
     &o,
     [](auto const typed_o)
     {
-      using T = std::decay_t<std::remove_pointer_t<decltype(typed_o)>>;
+      using T = std::decay_t<std::remove_pointer_t<decltype(typed_o)>>;
       /* Alternatively, I could `if constexpr` check here and
          do something else otherwise. */
       static_assert(stringable<T>, "Object must be stringable");
 
-      fmt::print("{}", typed_o->to_string());
+      fmt::print("{}", typed_o->to_string());
     }
   );
-}

+}

Finally, let's wrap up with a more real-world example. In Clojure, getting the length of a sequence can be an O(n) operation. However, some sequences may already know their length, or have it cached. In Clojure, there's a Counted interface for this; in jank, it's called countable. The old inheritance version of countable looked like this:

struct countable
 {
   virtual ~countable() = default;
   virtual size_t count() const = 0;
-};

+};

The concept for it would be very similar:

template <typename T>
 concept countable = requires(T * const t)
 {
-  { t->count() } -> std::convertible_to<size_t>;
-};

+  { t->count() } -> std::convertible_to<size_t>;
+};

And we can conditionally use it when measuring a sequence's length:

size_t sequence_length(object_ptr const s)
 {
   if(s == nullptr)
   { return 0; }
@@ -185,7 +185,7 @@
     s,
     [](auto const typed_s)
     {
-      using T = std::decay_t<std::remove_pointer_t<decltype(typed_s)>>;
+      using T = std::decay_t<std::remove_pointer_t<decltype(typed_s)>>;
 
       if constexpr(countable<T>)
       { return typed_s->count(); }
diff --git a/blog/2023-12-17-module-loading/index.html b/blog/2023-12-17-module-loading/index.html
index cf701e3..907bdf3 100644
--- a/blog/2023-12-17-module-loading/index.html
+++ b/blog/2023-12-17-module-loading/index.html
@@ -54,31 +54,31 @@
 |  |  |
 |  |  /* How many fixed arguments are required before the packed args? */
 |  /* Is there an ambiguous overload? */
-/* Is the function variadic? */

+/* Is the function variadic? */

From there, when we use it, we disable the bit for question 2 and we switch on the rest. This allows us to do an O(1) jump on the combination of whether it's variadic and the required fixed args. Finally, we only need the question 2 bit to disambiguate one branch of each switch, which is the branch equal to however many arguments we received.
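
As a sketch of what those helpers might look like, with an assumed bit layout matching the diagram above (variadic flag in the top bit, ambiguity flag next, fixed arg count in the low bits; uint8_t comes from <cstdint>, and jank's real encoding may differ):

constexpr uint8_t variadic_bit{ 1 << 7 };
constexpr uint8_t ambiguous_bit{ 1 << 6 };

/* Build the flags for a variadic function requiring `fixed_args`
   arguments before the packed ones. */
constexpr uint8_t mask_variadic_arity(uint8_t const fixed_args)
{ return variadic_bit | fixed_args; }

/* Strip the ambiguity bit so we can switch on variadic + fixed args. */
constexpr uint8_t extract_variadic_arity_mask(uint8_t const arity_flags)
{ return arity_flags & ~ambiguous_bit; }

constexpr bool is_variadic_ambiguous(uint8_t const arity_flags)
{ return arity_flags & ambiguous_bit; }

Only the branch whose incoming arg count matches the required fixed arity needs to consult the ambiguity bit, as the dynamic_call implementation shows: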

object_ptr dynamic_call(object_ptr const source, object_ptr const a1)
 {
   return visit_object
   (
     [=](auto const typed_source) -> object_ptr
     {
-      using T = typename decltype(typed_source)::value_type;
+      using T = typename decltype(typed_source)::value_type;
 
-      if constexpr(function_like<T> || std::is_base_of_v<callable, T>)
+      if constexpr(function_like<T> || std::is_base_of_v<callable, T>)
       {
         /* This is the whole byte, answering all three questions. */
         auto const arity_flags(typed_source->get_arity_flags());
         /* We strip out the bit for ambiguous checking and switch on it. */
-        auto const mask(callable::extract_variadic_arity_mask(arity_flags));
+        auto const mask(callable::extract_variadic_arity_mask(arity_flags));
 
         /* We're matching on variadic + required arg position. */
         switch(mask)
         {
-          case callable::mask_variadic_arity(0):
-            return typed_source->call(make_box<obj::native_array_sequence>(a1));
-          case callable::mask_variadic_arity(1):
+          case callable::mask_variadic_arity(0):
+            return typed_source->call(make_box<obj::native_array_sequence>(a1));
+          case callable::mask_variadic_arity(1):
             /* Only in the case where the arg count == the required arity do we
                check the extra bit in the flags. */
-            if(!callable::is_variadic_ambiguous(arity_flags))
-            { return typed_source->call(a1, obj::nil::nil_const()); }
+            if(!callable::is_variadic_ambiguous(arity_flags))
+            { return typed_source->call(a1, obj::nil::nil_const()); }
             /* We're falling through! */
           default:
             /* The default case is not variadic. */
diff --git a/blog/2023-12-30-fast-string/index.html b/blog/2023-12-30-fast-string/index.html
index 38947d4..7bfc581 100644
--- a/blog/2023-12-30-fast-string/index.html
+++ b/blog/2023-12-30-fast-string/index.html
@@ -1,4 +1,4 @@
-jank's new persistent string is fast
+jank's new persistent string is fast
jank's new persistent string is fast
Dec 30, 2023 · Jeaye Wilkerson

One thing I've been meaning to do is build a custom string class for jank. I had some time, during the holidays, between wrapping up this quarter's work and starting on next quarter's, so I decided to see if I could beat both std::string and folly::fbstring, in terms of performance. After all, if we're gonna make a string class, it'll need to be fast. :)

The back story here is that jank needs to be able to get a hash from a string, and that hash should be cached along with the string. This means I can't use existing string classes, since C++ doesn't have the duck typing mechanisms needed to add this behavior without completely wrapping the class.

Now, I could just wrap std::string or folly::fbstring, and all their member functions, but I had a couple of other goals in mind, too. In particular, I want jank's string to be persistent, as the rest of its data structures are. Also, since I know jank is garbage collected, and the string is persistent, I should be able to do substring operations and string copies by sharing memory, rather than doing deep copies (there's a sketch of this just after the goals below). To summarize these goals:

Goals

  • Be as fast, or faster, than std::string and folly::fbstring
  • Support hashing, with cached value
  • Be immutable (i.e. no copy on substrings, writes only done in constructors, no mutators)
  • Not a goal: Complete standard compliance (which allows me to cheat)
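
Here's a rough sketch of what that memory-sharing goal implies. The field names are assumed for illustration; jank's real class is more involved:

struct string
{
  char const *data{};
  size_t length{};

  /* With the GC keeping the original buffer alive, and immutability
     guaranteeing it never changes, a substring can alias the buffer
     instead of allocating and copying. */
  string substr(size_t const start, size_t const count) const
  { return { data + start, count }; }
};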

There are three noteworthy C++ string implementations:

  • std::string from GCC's libstdc++
  • std::string from LLVM's libc++
  • folly::fbstring from Facebook's folly

Each of them uses a different memory layout and encoding scheme. GCC's string is the simplest to understand, so I started with that. libc++'s string and folly's string are similar, but folly takes things a step further, so I'm going to skip over talking about the libc++ string entirely. Let's not get ahead of ourselves, though. We're starting with GCC's string.

libstdc++'s string

Each of these strings has some features in common, but they go about it differently. In libstdc++'s string, the overall layout is composed of three things:

  1. A pointer to the char array
  2. The length of the char array
  3. The allocated capacity (which may be more than the length)

However, one commonality among these strings is that they all employ a "small string optimization" (SSO). SSO is a trick which avoids dynamic allocations by storing small strings within the memory of the string class itself, up to a certain size. To accommodate this, libstdc++'s string has a fourth member, which is a char array of 16 bytes. However, a union is used so that the 16 byte char array actually shares the same memory as the third member listed above, the capacity. Depending on whether or not the string is small (based on its length), the pointer will point at the local buffer or somewhere on the "heap", and the capacity will either actually be the capacity or it'll be part of the memory used to store the small string in-situ (in place).

The code for this would look like:

struct string
 {
   char *data;
   size_t length;
@@ -8,9 +8,9 @@
     char sso[16];
     size_t capacity;
   };
-};

+};

This is a straightforward approach that ends up saving sizeof(size_t) in memory, per string, by overlapping the capacity and the local buffer. If they weren't overlapping, a large string (not using SSO) would have 16 bytes of completely unused and wasted memory, which makes it slower to allocate and, naturally, increases the memory usage of your program.

On a 64 bit system, libstdc++'s string takes up 32 bytes, half of which is used for SSO, including a null-terminator. So that means up to 15 bytes of string data can fit within the string without requiring an allocation. The plus side of this is that, on a 32 bit system, the string will take up 24 bytes and you'll still have up to 15 bytes of string data to use for SSO.

However, I have two key gripes with this design:

  1. The ratio of string bytes to SSO bytes (15:32 = 0.469) is not great
  2. The string is already quite large and I'm looking to add a cached hash to it, which will only make it larger

How can we do better?

folly's string

folly's string, and libc++'s string, take a different approach which is significantly more complex. However, the wins are impressive. So, we start with the same three members:

  1. A pointer to the char array
  2. The length of the char array
  3. The allocated capacity (which may be more than the length)

That's how we handle the string in the large case. However, for the small case, we use a union over all three of those members, spanning the entire string. It looks like this:

struct string
 {
-  struct large_storage
+  struct large_storage
   {
     char *data;
     size_t length;
@@ -26,18 +26,18 @@
 ---------------------------------^^
     /* Actual capacity data. */ |||
               /* Is it large? */ ||
-              /* Is it medium? */ |

+              /* Is it medium? */ |

For a small string, both of those flag bits are 0. This is important and it's the final piece of the puzzle: where do we store the size for small strings? Well, we store it in the remaining 6 bits of capacity data. But we don't just store the size, oh no. We store the remaining capacity (max_size - size). This lovely treat allows us to use that final byte as the null terminator when the string is full, since the two flag bits will be 0 and the remaining capacity will be 0, thus the whole byte will be 0.

This means folly's string allows for 23 bytes of small string data in a 24 byte string. That's 23:24 = 0.958, compared to the previous 15:32 = 0.469. Our string is 24 bytes, compared to previous 32 bytes, too! A very impressive design.
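
To make that concrete, here's an illustrative decoding of the last byte (the exact bit positions are assumptions; folly's real code also depends on endianness):

constexpr size_t max_small_size{ 23 };

/* The low 6 bits hold (max_small_size - size); the top 2 bits are
   the category flags, which are both 0 for small strings. */
size_t small_size(unsigned char const last_byte)
{ return max_small_size - (last_byte & 0b00111111); }

/* A full small string has 0 remaining capacity and 0 flags, so the
   last byte is 0 and doubles as the null terminator. */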

Empty member optimization

There's a trick, which all three of these string classes use, called the empty member optimization. I'll explain it because it's another example of how crazy C++ is. In C++, an empty struct can't have a size of 0. It generally has a size of 1. This is important for addressing, as I'll show here.

struct empty
 { };
 
-struct foo
+struct foo
 {
   empty e;
   char *p;
-};

+};

In this example, if empty had a size of 0, then both e and p would have the same address within a foo instance. That's not the case. Actually, it's far more complicated. Since empty has the size of 1, so does e, but since a machine word is generally larger (i.e. 64 bits), the space between e and p is filled with padding. On my 64 bit system, e is 1 byte, followed by 7 bytes of padding, followed by 8 bytes for p. The total size of foo is 16 bytes, even though it holds just a single 8 byte pointer.
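
A quick sketch of the arithmetic above (values typical for a 64-bit system; padding is implementation-defined, and offsetof comes from <cstddef>):

static_assert(sizeof(empty) == 1);
/* e occupies 1 byte, then 7 bytes of padding align p. */
static_assert(offsetof(foo, p) == 8);
/* 16 bytes total, to hold a single 8 byte pointer. */
static_assert(sizeof(foo) == 16);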

Why does this matter?

Containers like strings each have an allocator type, which is generally customizable. Most, but not all, allocators are stateless, empty structs, which differ only in their function behavior. The containers need an instance of the allocator in order to be able to use it and they can't make assumptions about what state is in there. But who wants to pay for all of that padding for an empty allocator? Not me. Not the C++ standard library authors.

Fortunately, C++ allows for base classes to be empty, without affecting the size of the derived class. So, we wrap one of our members in a class, inherit from our allocator, and we don't actually need to pay any space or runtime cost for an empty allocator. In folly's string, it looks something like this:

struct string
 {
   /* We wrap our union in a struct and inherit from
      our allocator. */
-  struct storage : allocator_type
+  struct storage : allocator_type
   {
     union
     {
diff --git a/blog/2024-02-23-bindings/index.html b/blog/2024-02-23-bindings/index.html
index c648312..57ebe7c 100644
--- a/blog/2024-02-23-bindings/index.html
+++ b/blog/2024-02-23-bindings/index.html
@@ -31,7 +31,7 @@
 ; When we bind *foo* to 100, we add an override.
 '({#'*foo* 100} {#'*foo* 0})
 
-; At the end of each binding form, we pop from the top of the stack.

+; At the end of each binding form, we pop from the top of the stack.

With this stack of overrides, we just need to update the deref implementation for vars so that it doesn't simply return the root. Instead, it checks whether the var has any overrides and returns the latest one, falling back on the root if there is none. In jank's C++ code, that looks like this:

object_ptr var::deref() const
 {
   auto const binding(get_thread_binding());
   if(binding)
diff --git a/blog/feed.xml b/blog/feed.xml
index 8b52cc9..1c4efc5 100644
--- a/blog/feed.xml
+++ b/blog/feed.xml
@@ -1 +1 @@
-2024-04-26T06:11:02.184184919Zjank bloghttps://jank-lang.org/blog/jank development update - Syntax quoting!2024-03-29T00:00:00Z2024-03-29T00:00:00Zhttps://jank-lang.org/blog/2024-03-29-syntax-quotingJeaye Wilkerson<p>Oh, hey folks. I was just wrapping up this macro I was writing. One moment.</p>jank development update - Dynamic bindings and more!2024-02-23T00:00:00Z2024-02-23T00:00:00Zhttps://jank-lang.org/blog/2024-02-23-bindingsJeaye Wilkerson<p>For the past couple of months, I have been focused on tackling dynamic var bindings and meta hints, which grew into much, much more. Along the way, I&apos;ve learned some neat things and have positioned jank to be ready for a lot more outside contributions. Grab a seat and I&apos;ll explain it all! Much love to <a href="https://www.clojuriststogether.org/">Clojurists Together</a>, who have sponsored some of my work this quarter.</p>jank's new persistent string is fast2023-12-30T00:00:00Z2023-12-30T00:00:00Zhttps://jank-lang.org/blog/2023-12-30-fast-stringJeaye Wilkerson<p>One thing I&apos;ve been meaning to do is build a custom string class for jank. I had some time, during the holidays, between wrapping up this quarter&apos;s work and starting on next quarter&apos;s, so I decided to see if I could beat both <code>std::string</code> and <code>folly::fbstring</code>, in terms of performance. After all, if we&apos;re gonna make a string class, it&apos;ll need to be fast. :)</p>jank development update - Load all the modules!2023-12-17T00:00:00Z2023-12-17T00:00:00Zhttps://jank-lang.org/blog/2023-12-17-module-loadingJeaye Wilkerson<p>I&apos;ve been quiet for the past couple of months, finishing up this work on jank&apos;s module loading, class path handling, aliasing, and var referring. Along the way, I ran into some very interesting bugs and we&apos;re in for a treat of technical detail in this holiday edition of jank development updates! A warm shout out to my <a href="https://github.com/sponsors/jeaye">Github sponsors</a> and <a href="https://www.clojuriststogether.org/">Clojurists Together</a> for sponsoring this work.</p>jank development update - Module loading2023-10-14T00:00:00Z2023-10-14T00:00:00Zhttps://jank-lang.org/blog/2023-10-14-module-loadingJeaye Wilkerson<p>For the past month and a half, I&apos;ve been building out jank&apos;s support for <code>clojure.core/require</code>, including everything from class path handling to compiling jank files to intermediate code written to the filesystem. This is a half-way report for the quarter. As a warm note, my work on jank this quarter is being sponsored by <a href="https://www.clojuriststogether.org/">Clojurists Together</a>.</p>jank development update - Object model results2023-08-26T00:00:00Z2023-08-26T00:00:00Zhttps://jank-lang.org/blog/2023-08-26-object-modelJeaye Wilkerson<p>As summer draws to a close, in the Pacific Northwest, so too does my term of sponsored work focused on a faster object model for jank. Thanks so much to <a href="https://www.clojuriststogether.org/">Clojurists Together</a> for funding jank&apos;s development. The past quarter has been quite successful and I&apos;m excited to share the results.</p>jank development update - A faster object model2023-07-08T00:00:00Z2023-07-08T00:00:00Zhttps://jank-lang.org/blog/2023-07-08-object-modelJeaye Wilkerson<p>This quarter, my work on jank is being sponsored by <a href="https://www.clojuriststogether.org/">Clojurists Together</a>. 
The terms of the work are to research a new object model for jank, with the goal of making jank code faster across the board. This is a half-way report and I&apos;m excited to share my results!</p>jank development update - Optimizing a ray tracer2023-04-07T00:00:00Z2023-04-07T00:00:00Zhttps://jank-lang.org/blog/2023-04-07-ray-tracingJeaye Wilkerson<p>After the <a href="/blog/2023-01-13-optimizing-sequences">last post</a>, which focused on optimizing jank&apos;s sequences, I wanted to get jank running a ray tracer I had previously written in Clojure. In this post, I document what was required to start ray tracing in jank and, more importantly, how I chased down the run time in a fierce battle with Clojure&apos;s performance.</p>jank development update - Optimizing sequences2023-01-13T00:00:00Z2023-01-13T00:00:00Zhttps://jank-lang.org/blog/2023-01-13-optimizing-sequencesJeaye Wilkerson<p>In this episode of jank&apos;s development updates, we follow an exciting few weekends as I was digging deep into Clojure&apos;s sequence implementation, building jank&apos;s equivalent, and then benchmarking and profiling in a dizzying race to the bottom.</p>jank development update - Lots of new changes2022-12-08T00:00:00Z2022-12-08T00:00:00Zhttps://jank-lang.org/blog/2022-12-08-progress-updateJeaye Wilkerson<p>I was previously giving updates only in the <a href="https://clojurians.slack.com/archives/C03SRH97FDK">#jank</a> Slack channel, but some of these are getting large enough to warrant more prose. Thus, happily, I can announce that jank has a new blog and I have a <i>lot</i> of new progress to report! Let&apos;s get into the details.</p>
\ No newline at end of file
+2024-04-26T06:25:07.305416894Zjank bloghttps://jank-lang.org/blog/jank development update - Syntax quoting!2024-03-29T00:00:00Z2024-03-29T00:00:00Zhttps://jank-lang.org/blog/2024-03-29-syntax-quotingJeaye Wilkerson<p>Oh, hey folks. I was just wrapping up this macro I was writing. One moment.</p>jank development update - Dynamic bindings and more!2024-02-23T00:00:00Z2024-02-23T00:00:00Zhttps://jank-lang.org/blog/2024-02-23-bindingsJeaye Wilkerson<p>For the past couple of months, I have been focused on tackling dynamic var bindings and meta hints, which grew into much, much more. Along the way, I&apos;ve learned some neat things and have positioned jank to be ready for a lot more outside contributions. Grab a seat and I&apos;ll explain it all! Much love to <a href="https://www.clojuriststogether.org/">Clojurists Together</a>, who have sponsored some of my work this quarter.</p>jank's new persistent string is fast2023-12-30T00:00:00Z2023-12-30T00:00:00Zhttps://jank-lang.org/blog/2023-12-30-fast-stringJeaye Wilkerson<p>One thing I&apos;ve been meaning to do is build a custom string class for jank. I had some time, during the holidays, between wrapping up this quarter&apos;s work and starting on next quarter&apos;s, so I decided to see if I could beat both <code>std::string</code> and <code>folly::fbstring</code>, in terms of performance. After all, if we&apos;re gonna make a string class, it&apos;ll need to be fast. :)</p>jank development update - Load all the modules!2023-12-17T00:00:00Z2023-12-17T00:00:00Zhttps://jank-lang.org/blog/2023-12-17-module-loadingJeaye Wilkerson<p>I&apos;ve been quiet for the past couple of months, finishing up this work on jank&apos;s module loading, class path handling, aliasing, and var referring. Along the way, I ran into some very interesting bugs and we&apos;re in for a treat of technical detail in this holiday edition of jank development updates! A warm shout out to my <a href="https://github.com/sponsors/jeaye">Github sponsors</a> and <a href="https://www.clojuriststogether.org/">Clojurists Together</a> for sponsoring this work.</p>jank development update - Module loading2023-10-14T00:00:00Z2023-10-14T00:00:00Zhttps://jank-lang.org/blog/2023-10-14-module-loadingJeaye Wilkerson<p>For the past month and a half, I&apos;ve been building out jank&apos;s support for <code>clojure.core/require</code>, including everything from class path handling to compiling jank files to intermediate code written to the filesystem. This is a half-way report for the quarter. As a warm note, my work on jank this quarter is being sponsored by <a href="https://www.clojuriststogether.org/">Clojurists Together</a>.</p>jank development update - Object model results2023-08-26T00:00:00Z2023-08-26T00:00:00Zhttps://jank-lang.org/blog/2023-08-26-object-modelJeaye Wilkerson<p>As summer draws to a close, in the Pacific Northwest, so too does my term of sponsored work focused on a faster object model for jank. Thanks so much to <a href="https://www.clojuriststogether.org/">Clojurists Together</a> for funding jank&apos;s development. The past quarter has been quite successful and I&apos;m excited to share the results.</p>jank development update - A faster object model2023-07-08T00:00:00Z2023-07-08T00:00:00Zhttps://jank-lang.org/blog/2023-07-08-object-modelJeaye Wilkerson<p>This quarter, my work on jank is being sponsored by <a href="https://www.clojuriststogether.org/">Clojurists Together</a>. 
The terms of the work are to research a new object model for jank, with the goal of making jank code faster across the board. This is a half-way report and I&apos;m excited to share my results!</p>jank development update - Optimizing a ray tracer2023-04-07T00:00:00Z2023-04-07T00:00:00Zhttps://jank-lang.org/blog/2023-04-07-ray-tracingJeaye Wilkerson<p>After the <a href="/blog/2023-01-13-optimizing-sequences">last post</a>, which focused on optimizing jank&apos;s sequences, I wanted to get jank running a ray tracer I had previously written in Clojure. In this post, I document what was required to start ray tracing in jank and, more importantly, how I chased down the run time in a fierce battle with Clojure&apos;s performance.</p>jank development update - Optimizing sequences2023-01-13T00:00:00Z2023-01-13T00:00:00Zhttps://jank-lang.org/blog/2023-01-13-optimizing-sequencesJeaye Wilkerson<p>In this episode of jank&apos;s development updates, we follow an exciting few weekends as I was digging deep into Clojure&apos;s sequence implementation, building jank&apos;s equivalent, and then benchmarking and profiling in a dizzying race to the bottom.</p>jank development update - Lots of new changes2022-12-08T00:00:00Z2022-12-08T00:00:00Zhttps://jank-lang.org/blog/2022-12-08-progress-updateJeaye Wilkerson<p>I was previously giving updates only in the <a href="https://clojurians.slack.com/archives/C03SRH97FDK">#jank</a> Slack channel, but some of these are getting large enough to warrant more prose. Thus, happily, I can announce that jank has a new blog and I have a <i>lot</i> of new progress to report! Let&apos;s get into the details.</p>
\ No newline at end of file