From fb007f9202d7a858e187ce7a7c6bcd5cacbc15b7 Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Mon, 10 Feb 2025 21:48:52 -0700 Subject: [PATCH 1/4] add a benchmark and profile script and hook into CI --- .github/workflows/benchmarks.yml | 26 ++++ .github/workflows/profile.yml | 38 ++++++ bin/benchmark | 226 +++++++++++++++++++++++++++++++ bin/profile | 177 ++++++++++++++++++++++++ 4 files changed, 467 insertions(+) create mode 100644 .github/workflows/benchmarks.yml create mode 100644 .github/workflows/profile.yml create mode 100755 bin/benchmark create mode 100755 bin/profile diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..6ed90a1f --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,26 @@ +name: Benchmarks + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + memcached -d -p 11222 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2 + bundler-cache: true # 'bundle install' and cache + - name: Run Benchmarks + run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/benchmark diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml new file mode 100644 index 00000000..e4e59004 --- /dev/null +++ b/.github/workflows/profile.yml @@ -0,0 +1,38 @@ +name: Profiles + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.4 + bundler-cache: true # 'bundle install' and cache + - name: Run Profiles + run: 
RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/profile + - name: Upload profile results + uses: actions/upload-artifact@v4 + with: + name: profile-results + path: | + client_get_profile.json + socket_get_profile.json + client_set_profile.json + socket_set_profile.json + client_get_multi_profile.json + socket_get_multi_profile.json + client_set_multi_profile.json + socket_set_multi_profile.json diff --git a/bin/benchmark b/bin/benchmark new file mode 100755 index 00000000..a1cfe33d --- /dev/null +++ b/bin/benchmark @@ -0,0 +1,226 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps benchmark current performance of Dalli +# as well as compare performance of optimized and non-optimized calls like multi-set vs set +# +# run with: +# bundle exec bin/benchmark +# RUBY_YJIT_ENABLE=1 BENCH_TARGET=get bundle exec bin/benchmark +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'monitor' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. 
+## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" + +if dalli_url.include?('unix') + ENV['BENCH_CACHE_URL'].gsub('unix://','') +end +bench_target = ENV['BENCH_TARGET'] || 'set' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +payload = 'B' * bench_payload_size +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, raw: true) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. +sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +# ensure the clients are all connected and working +client.set('key', payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * 50_000 +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +### +# GC Suite +# benchmark without GC 
skewing things +### +class GCSuite + def warming(*) + run_gc + end + + def running(*) + run_gc + end + + def warmup_stats(*); end + + def add_report(*); end + + private + + def run_gc + GC.enable + GC.start + GC.disable + end +end +suite = GCSuite.new + +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end + + +if %w[all set].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('client set') { client.set('key', payload) } + #x.report('multi client set') { multi_client.set('string_key', payload) } + x.report('raw sock set') do + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + x.compare! 
+ end +end + +@lock = Monitor.new +if %w[all get].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get dalli') { client.get('key') } + # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly + x.report('get sock') do + sock.write("get sock_key\r\n") + sock.readline + sock.read(payload.bytesize) + end + # NOTE: This shows that when adding thread safety & non-blocking IO we are slower for single process/thread use case + x.report('get sock non-blocking') do + @lock.synchronize do + sock.write("get sock_key\r\n") + sock.readline + count = payload.bytesize + value = String.new(capacity: count + 1) + loop do + begin + value << sock.read_nonblock(count - value.bytesize) + rescue Errno::EAGAIN + IO.select([sock]) + retry + rescue EOFError + puts "EOFError" + break + end + break if value.bytesize == count + end + end + end + x.compare! + end +end + +if %w[all get_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get 100 keys') { client.get_multi(pairs.keys) } + x.report('get 100 keys raw sock') { sock_get_multi(sock, pairs) } + x.compare! + end +end + +if %w[all set_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('write 100 keys simple') do + client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end + end + x.report('multi client set_multi 100') do + multi_client.set_multi(pairs, 3600, raw: true) + end + x.report('write 100 keys rawsock') do + count = pairs.length + tail = '' + value_bytesize = payload_smaller.bytesize + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? 
'' : 'q' + sock.write(String.new("ms #{key} #{value_bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value_bytesize + 40) << value << TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer + end + x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } + x.compare! + end +end diff --git a/bin/profile b/bin/profile new file mode 100755 index 00000000..f630d83c --- /dev/null +++ b/bin/profile @@ -0,0 +1,177 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps profile specific call paths in Dalli +# finding and fixing performance issues in these profiles should result in improvements in the dalli benchmarks +# +# run with: +# RUBY_YJIT_ENABLE=1 bundle exec bin/profile +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'vernier' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'vernier' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. +## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" + +if dalli_url.include?('unix') + ENV['BENCH_CACHE_URL'].gsub('unix://','') +end +bench_target = ENV['BENCH_TARGET'] || 'get' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. 
+sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +payload = 'B' * bench_payload_size +dalli_key = 'dalli_key' +# ensure the clients are all connected and working +client.set(dalli_key, payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * 50_000 +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end + +def sock_set_multi(sock, pairs) + count = pairs.length + tail = '' + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? 
'' : 'q' + sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", capacity: key.size + value.bytesize + 40)) + sock.write(value) + sock.write(TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer +end + +if %w[all get].include?(bench_target) + Vernier.profile(out: 'client_get_profile.json') do + start_time = Time.now + client.get(dalli_key) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_get_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time do + sock.write("get sock_key\r\n") + sock.readline + sock.read(payload.bytesize) + end + end +end + +if %w[all set].include?(bench_target) + Vernier.profile(out: 'client_set_profile.json') do + start_time = Time.now + client.set(dalli_key, payload, 3600, raw: true) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_set_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + end +end + +if %w[all get_multi].include?(bench_target) + Vernier.profile(out: 'client_get_multi_profile.json') do + start_time = Time.now + client.get_multi(pairs.keys) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_get_multi_profile.json') do + start_time = Time.now + sock_get_multi(sock, pairs) while Time.now - start_time < bench_time + end +end + +if %w[all set_multi].include?(bench_target) + Vernier.profile(out: 'client_set_multi_profile.json') do + start_time = Time.now + client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_set_multi_profile.json') do + start_time = Time.now + sock_set_multi(sock, pairs) while Time.now - start_time < bench_time + end +end From 15cb11d8c7b38c739a827a0a6fcf9b59ad76f1c9 Mon Sep 17 00:00:00 2001 
From: Dan Mayer Date: Mon, 10 Feb 2025 22:11:26 -0700 Subject: [PATCH 2/4] fix rubocop complaints --- bin/benchmark | 24 +++++++++++++++--------- bin/profile | 19 +++++++++++++------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index a1cfe33d..96802ae2 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -33,11 +33,9 @@ class StringSerializer end end -dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' -if dalli_url.include?('unix') - ENV['BENCH_CACHE_URL'].gsub('unix://','') -end +ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'set' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i @@ -47,7 +45,8 @@ TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, + raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -106,6 +105,10 @@ class GCSuite end suite = GCSuite.new +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity def sock_get_multi(sock, pairs) count = pairs.length pairs.each_key do |key| @@ -129,13 +132,16 @@ def sock_get_multi(sock, pairs) end results end - +# rubocop:enable Metrics/MethodLength +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity if %w[all set].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) x.report('client set') { client.set('key', payload) } - #x.report('multi client set') { multi_client.set('string_key', payload) } + # x.report('multi client set') { multi_client.set('string_key', payload) } x.report('raw sock set') do sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") sock.write(payload) @@ -169,10 +175,10 @@ if %w[all get].include?(bench_target) begin value << sock.read_nonblock(count - value.bytesize) rescue Errno::EAGAIN - IO.select([sock]) + sock.wait_readable retry rescue EOFError - puts "EOFError" + puts 'EOFError' break end break if value.bytesize == count diff --git a/bin/profile b/bin/profile index f630d83c..33d10a64 100755 --- a/bin/profile +++ b/bin/profile @@ -33,11 +33,9 @@ class StringSerializer end end -dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' -if dalli_url.include?('unix') - ENV['BENCH_CACHE_URL'].gsub('unix://','') -end +ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'get' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i @@ -78,6 +76,10 @@ client.quiet do end end +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable 
Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity def sock_get_multi(sock, pairs) count = pairs.length pairs.each_key do |key| @@ -101,6 +103,9 @@ def sock_get_multi(sock, pairs) end results end +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity def sock_set_multi(sock, pairs) count = pairs.length @@ -110,13 +115,15 @@ def sock_set_multi(sock, pairs) pairs.each do |key, value| count -= 1 tail = count.zero? ? '' : 'q' - sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", capacity: key.size + value.bytesize + 40)) + sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value.bytesize + 40)) sock.write(value) sock.write(TERMINATOR) end sock.flush sock.gets(TERMINATOR) # clear the buffer end +# rubocop:enable Metrics/MethodLength if %w[all get].include?(bench_target) Vernier.profile(out: 'client_get_profile.json') do @@ -126,7 +133,7 @@ if %w[all get].include?(bench_target) Vernier.profile(out: 'socket_get_profile.json') do start_time = Time.now - while Time.now - start_time < bench_time do + while Time.now - start_time < bench_time sock.write("get sock_key\r\n") sock.readline sock.read(payload.bytesize) From d872856b4bc9fb46a42d47b39b19387db085cff9 Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Mon, 10 Feb 2025 22:17:52 -0700 Subject: [PATCH 3/4] leave out set_multi until we send that PR --- bin/benchmark | 13 +++++++------ bin/profile | 8 +++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index 96802ae2..70c520cd 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -45,8 +45,8 @@ TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, - raw: 
true) +# multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, +# raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. @@ -208,9 +208,10 @@ if %w[all set_multi].include?(bench_target) end end end - x.report('multi client set_multi 100') do - multi_client.set_multi(pairs, 3600, raw: true) - end + # TODO: uncomment this once we add PR adding set_multi + # x.report('multi client set_multi 100') do + # multi_client.set_multi(pairs, 3600, raw: true) + # end x.report('write 100 keys rawsock') do count = pairs.length tail = '' @@ -226,7 +227,7 @@ if %w[all set_multi].include?(bench_target) sock.flush sock.gets(TERMINATOR) # clear the buffer end - x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } + # x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } x.compare! end end diff --git a/bin/profile b/bin/profile index 33d10a64..46a02f04 100755 --- a/bin/profile +++ b/bin/profile @@ -174,7 +174,13 @@ end if %w[all set_multi].include?(bench_target) Vernier.profile(out: 'client_set_multi_profile.json') do start_time = Time.now - client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + # until we port over set_multi, compare the simple loop + # client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end end Vernier.profile(out: 'socket_set_multi_profile.json') do From f51fbcc4bf348e29f44ce56e455889ee5eabb30f Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Tue, 11 Feb 2025 16:16:39 -0700 Subject: [PATCH 4/4] improvements to profile and benchmark based on feedback --- bin/benchmark | 48 +++++++++++++++++++++++++++++++++++------------- bin/profile | 29 +++++++++++++++++++---------- 2 files changed, 54 
insertions(+), 23 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index 70c520cd..8894fb5a 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -34,8 +34,6 @@ class StringSerializer end dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' - -ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'set' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i @@ -44,9 +42,9 @@ payload = 'B' * bench_payload_size TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" -client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -# multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, -# raw: true) +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, + raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -56,18 +54,29 @@ sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) # Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size # sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) # Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size -sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) # ensure the clients are all connected and working client.set('key', payload) +multi_client.set('multi_key', payload) sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") sock.write(payload) sock.write(TERMINATOR) sock.flush sock.readline # clear the buffer +raise 'dalli client mismatch' if payload != client.get('key') + +raise 'multi dalli client mismatch' if payload != multi_client.get('multi_key') + +sock.write("mg sock_key v\r\n") +sock.readline +sock_value = sock.read(payload.bytesize) +sock.read(TERMINATOR.bytesize) +raise 'sock mismatch' if payload != sock_value + # ensure we have basic data for the benchmarks and get calls -payload_smaller = 'B' * 50_000 +payload_smaller = 'B' * (bench_payload_size / 10) pairs = {} 100.times do |i| pairs["multi_#{i}"] = payload_smaller @@ -157,17 +166,22 @@ end if %w[all get].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) - x.report('get dalli') { client.get('key') } + x.report('get dalli') do + result = client.get('key') + raise 'mismatch' unless result == payload + end # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly x.report('get sock') do - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline - sock.read(payload.bytesize) + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload end # NOTE: This shows that when adding thread safety & non-blocking IO 
we are slower for single process/thread use case x.report('get sock non-blocking') do @lock.synchronize do - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline count = payload.bytesize value = String.new(capacity: count + 1) @@ -183,6 +197,8 @@ if %w[all get].include?(bench_target) end break if value.bytesize == count end + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless value == payload end end x.compare! @@ -192,8 +208,14 @@ end if %w[all get_multi].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) - x.report('get 100 keys') { client.get_multi(pairs.keys) } - x.report('get 100 keys raw sock') { sock_get_multi(sock, pairs) } + x.report('get 100 keys') do + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end + x.report('get 100 keys raw sock') do + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end x.compare! end end diff --git a/bin/profile b/bin/profile index 46a02f04..cd2b32c6 100755 --- a/bin/profile +++ b/bin/profile @@ -34,15 +34,13 @@ class StringSerializer end dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' - -ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'get' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" -client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false) +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -52,7 +50,7 @@ sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) # Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size # sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) # Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size -sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) payload = 'B' * bench_payload_size dalli_key = 'dalli_key' @@ -65,7 +63,7 @@ sock.flush sock.readline # clear the buffer # ensure we have basic data for the benchmarks and get calls -payload_smaller = 'B' * 50_000 +payload_smaller = 'B' * (bench_payload_size / 10) pairs = {} 100.times do |i| pairs["multi_#{i}"] = payload_smaller @@ -128,15 +126,20 @@ end if %w[all get].include?(bench_target) Vernier.profile(out: 'client_get_profile.json') do start_time = Time.now - client.get(dalli_key) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + result = client.get(dalli_key) + raise 'mismatch' unless result == payload + end end Vernier.profile(out: 'socket_get_profile.json') do start_time = Time.now while Time.now - start_time < bench_time - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline - sock.read(payload.bytesize) + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload end end end @@ -162,12 +165,18 @@ end if %w[all get_multi].include?(bench_target) Vernier.profile(out: 'client_get_multi_profile.json') do start_time = Time.now - client.get_multi(pairs.keys) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end end Vernier.profile(out: 'socket_get_multi_profile.json') do start_time = Time.now - sock_get_multi(sock, pairs) while Time.now - start_time < 
bench_time + while Time.now - start_time < bench_time + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end end end