From fb007f9202d7a858e187ce7a7c6bcd5cacbc15b7 Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Mon, 10 Feb 2025 21:48:52 -0700 Subject: [PATCH 1/4] add a benchmark and profile script and hook into CI --- .github/workflows/benchmarks.yml | 26 ++++ .github/workflows/profile.yml | 38 ++++++ bin/benchmark | 226 +++++++++++++++++++++++++++++++ bin/profile | 177 ++++++++++++++++++++++++ 4 files changed, 467 insertions(+) create mode 100644 .github/workflows/benchmarks.yml create mode 100644 .github/workflows/profile.yml create mode 100755 bin/benchmark create mode 100755 bin/profile diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..6ed90a1f --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,26 @@ +name: Benchmarks + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + memcached -d -p 11222 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.2 + bundler-cache: true # 'bundle install' and cache + - name: Run Benchmarks + run: RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/benchmark diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml new file mode 100644 index 00000000..e4e59004 --- /dev/null +++ b/.github/workflows/profile.yml @@ -0,0 +1,38 @@ +name: Profiles + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Install Memcached 1.6.23 + working-directory: scripts + env: + MEMCACHED_VERSION: 1.6.23 + run: | + chmod +x ./install_memcached.sh + ./install_memcached.sh + memcached -d + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 3.4 + bundler-cache: true # 'bundle install' and cache + - name: Run Profiles + run: 
RUBY_YJIT_ENABLE=1 BENCH_TARGET=all bundle exec bin/profile + - name: Upload profile results + uses: actions/upload-artifact@v4 + with: + name: profile-results + path: | + client_get_profile.json + socket_get_profile.json + client_set_profile.json + socket_set_profile.json + client_get_multi_profile.json + socket_get_multi_profile.json + client_set_multi_profile.json + socket_set_multi_profile.json diff --git a/bin/benchmark b/bin/benchmark new file mode 100755 index 00000000..a1cfe33d --- /dev/null +++ b/bin/benchmark @@ -0,0 +1,226 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps benchmark current performance of Dalli +# as well as compare performance of optimized and non-optimized calls like multi-set vs set +# +# run with: +# bundle exec bin/benchmark +# RUBY_YJIT_ENABLE=1 BENCH_TARGET=get bundle exec bin/benchmark +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'monitor' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. 
+## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" + +if dalli_url.include?('unix') + ENV['BENCH_CACHE_URL'].gsub('unix://','') +end +bench_target = ENV['BENCH_TARGET'] || 'set' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +payload = 'B' * bench_payload_size +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, raw: true) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. +sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +# ensure the clients are all connected and working +client.set('key', payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * 50_000 +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +### +# GC Suite +# benchmark without GC 
skewing things +### +class GCSuite + def warming(*) + run_gc + end + + def running(*) + run_gc + end + + def warmup_stats(*); end + + def add_report(*); end + + private + + def run_gc + GC.enable + GC.start + GC.disable + end +end +suite = GCSuite.new + +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end + + +if %w[all set].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('client set') { client.set('key', payload) } + #x.report('multi client set') { multi_client.set('string_key', payload) } + x.report('raw sock set') do + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + x.compare! 
+ end +end + +@lock = Monitor.new +if %w[all get].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get dalli') { client.get('key') } + # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly + x.report('get sock') do + sock.write("get sock_key\r\n") + sock.readline + sock.read(payload.bytesize) + end + # NOTE: This shows that when adding thread safety & non-blocking IO we are slower for single process/thread use case + x.report('get sock non-blocking') do + @lock.synchronize do + sock.write("get sock_key\r\n") + sock.readline + count = payload.bytesize + value = String.new(capacity: count + 1) + loop do + begin + value << sock.read_nonblock(count - value.bytesize) + rescue Errno::EAGAIN + IO.select([sock]) + retry + rescue EOFError + puts "EOFError" + break + end + break if value.bytesize == count + end + end + end + x.compare! + end +end + +if %w[all get_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('get 100 keys') { client.get_multi(pairs.keys) } + x.report('get 100 keys raw sock') { sock_get_multi(sock, pairs) } + x.compare! + end +end + +if %w[all set_multi].include?(bench_target) + Benchmark.ips do |x| + x.config(warmup: bench_warmup, time: bench_time, suite: suite) + x.report('write 100 keys simple') do + client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end + end + x.report('multi client set_multi 100') do + multi_client.set_multi(pairs, 3600, raw: true) + end + x.report('write 100 keys rawsock') do + count = pairs.length + tail = '' + value_bytesize = payload_smaller.bytesize + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? 
'' : 'q' + sock.write(String.new("ms #{key} #{value_bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value_bytesize + 40) << value << TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer + end + x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } + x.compare! + end +end diff --git a/bin/profile b/bin/profile new file mode 100755 index 00000000..f630d83c --- /dev/null +++ b/bin/profile @@ -0,0 +1,177 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# This helps profile specific call paths in Dalli +# finding and fixing performance issues in these profiles should result in improvements in the dalli benchmarks +# +# run with: +# RUBY_YJIT_ENABLE=1 bundle exec bin/profile +require 'bundler/inline' +require 'json' + +gemfile do + source 'https://rubygems.org' + gem 'benchmark-ips' + gem 'vernier' + gem 'logger' +end + +require_relative '../lib/dalli' +require 'benchmark/ips' +require 'vernier' + +## +# StringSerializer is a serializer that avoids the overhead of Marshal or JSON. +## +class StringSerializer + def self.dump(value) + value + end + + def self.load(value) + value + end +end + +dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" + +if dalli_url.include?('unix') + ENV['BENCH_CACHE_URL'].gsub('unix://','') +end +bench_target = ENV['BENCH_TARGET'] || 'get' +bench_time = (ENV['BENCH_TIME'] || 10).to_i +bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i +TERMINATOR = "\r\n" +puts "yjit: #{RubyVM::YJIT.enabled?}" + +client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false) + +# The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions +# in the library. 
+sock = TCPSocket.new('127.0.0.1', '11211', connect_timeout: 1) +sock.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_NODELAY, true) +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) +# Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size +# sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) +# Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size +sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) + +payload = 'B' * bench_payload_size +dalli_key = 'dalli_key' +# ensure the clients are all connected and working +client.set(dalli_key, payload) +sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") +sock.write(payload) +sock.write(TERMINATOR) +sock.flush +sock.readline # clear the buffer + +# ensure we have basic data for the benchmarks and get calls +payload_smaller = 'B' * 50_000 +pairs = {} +100.times do |i| + pairs["multi_#{i}"] = payload_smaller +end +client.quiet do + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end +end + +def sock_get_multi(sock, pairs) + count = pairs.length + pairs.each_key do |key| + count -= 1 + tail = count.zero? ? '' : 'q' + sock.write("mg #{key} v f k #{tail}\r\n") + end + sock.flush + # read all the memcached responses back and build a hash of key value pairs + results = {} + last_result = false + while (line = sock.readline.chomp!(TERMINATOR)) != '' + last_result = true if line.start_with?('EN ') + next unless line.start_with?('VA ') || last_result + + _, value_length, _flags, key = line.split + results[key[1..]] = sock.read(value_length.to_i) + sock.read(TERMINATOR.length) + break if results.size == pairs.size + break if last_result + end + results +end + +def sock_set_multi(sock, pairs) + count = pairs.length + tail = '' + ttl = 3600 + + pairs.each do |key, value| + count -= 1 + tail = count.zero? ? 
'' : 'q' + sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", capacity: key.size + value.bytesize + 40)) + sock.write(value) + sock.write(TERMINATOR) + end + sock.flush + sock.gets(TERMINATOR) # clear the buffer +end + +if %w[all get].include?(bench_target) + Vernier.profile(out: 'client_get_profile.json') do + start_time = Time.now + client.get(dalli_key) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_get_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time do + sock.write("get sock_key\r\n") + sock.readline + sock.read(payload.bytesize) + end + end +end + +if %w[all set].include?(bench_target) + Vernier.profile(out: 'client_set_profile.json') do + start_time = Time.now + client.set(dalli_key, payload, 3600, raw: true) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_set_profile.json') do + start_time = Time.now + while Time.now - start_time < bench_time + sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") + sock.write(payload) + sock.write("\r\n") + sock.flush + sock.readline # clear the buffer + end + end +end + +if %w[all get_multi].include?(bench_target) + Vernier.profile(out: 'client_get_multi_profile.json') do + start_time = Time.now + client.get_multi(pairs.keys) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_get_multi_profile.json') do + start_time = Time.now + sock_get_multi(sock, pairs) while Time.now - start_time < bench_time + end +end + +if %w[all set_multi].include?(bench_target) + Vernier.profile(out: 'client_set_multi_profile.json') do + start_time = Time.now + client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + end + + Vernier.profile(out: 'socket_set_multi_profile.json') do + start_time = Time.now + sock_set_multi(sock, pairs) while Time.now - start_time < bench_time + end +end From 15cb11d8c7b38c739a827a0a6fcf9b59ad76f1c9 Mon Sep 17 00:00:00 2001 
From: Dan Mayer Date: Mon, 10 Feb 2025 22:11:26 -0700 Subject: [PATCH 2/4] fix rubocop complaints --- bin/benchmark | 24 +++++++++++++++--------- bin/profile | 19 +++++++++++++------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index a1cfe33d..96802ae2 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -33,11 +33,9 @@ class StringSerializer end end -dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' -if dalli_url.include?('unix') - ENV['BENCH_CACHE_URL'].gsub('unix://','') -end +ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'set' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i @@ -47,7 +45,8 @@ TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, + raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -106,6 +105,10 @@ class GCSuite end suite = GCSuite.new +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity def sock_get_multi(sock, pairs) count = pairs.length pairs.each_key do |key| @@ -129,13 +132,16 @@ def sock_get_multi(sock, pairs) end results end - +# rubocop:enable Metrics/MethodLength +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity if %w[all set].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) x.report('client set') { client.set('key', payload) } - #x.report('multi client set') { multi_client.set('string_key', payload) } + # x.report('multi client set') { multi_client.set('string_key', payload) } x.report('raw sock set') do sock.write("ms sock_key #{payload.bytesize} T3600 MS\r\n") sock.write(payload) @@ -169,10 +175,10 @@ if %w[all get].include?(bench_target) begin value << sock.read_nonblock(count - value.bytesize) rescue Errno::EAGAIN - IO.select([sock]) + sock.wait_readable retry rescue EOFError - puts "EOFError" + puts 'EOFError' break end break if value.bytesize == count diff --git a/bin/profile b/bin/profile index f630d83c..33d10a64 100755 --- a/bin/profile +++ b/bin/profile @@ -33,11 +33,9 @@ class StringSerializer end end -dalli_url = ENV['BENCH_CACHE_URL'] || "127.0.0.1:11211" +dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' -if dalli_url.include?('unix') - ENV['BENCH_CACHE_URL'].gsub('unix://','') -end +ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'get' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i @@ -78,6 +76,10 @@ client.quiet do end end +# rubocop:disable Metrics/MethodLength +# rubocop:disable Metrics/PerceivedComplexity +# rubocop:disable 
Metrics/AbcSize +# rubocop:disable Metrics/CyclomaticComplexity def sock_get_multi(sock, pairs) count = pairs.length pairs.each_key do |key| @@ -101,6 +103,9 @@ def sock_get_multi(sock, pairs) end results end +# rubocop:enable Metrics/PerceivedComplexity +# rubocop:enable Metrics/AbcSize +# rubocop:enable Metrics/CyclomaticComplexity def sock_set_multi(sock, pairs) count = pairs.length @@ -110,13 +115,15 @@ def sock_set_multi(sock, pairs) pairs.each do |key, value| count -= 1 tail = count.zero? ? '' : 'q' - sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", capacity: key.size + value.bytesize + 40)) + sock.write(String.new("ms #{key} #{value.bytesize} c F0 T#{ttl} MS #{tail}\r\n", + capacity: key.size + value.bytesize + 40)) sock.write(value) sock.write(TERMINATOR) end sock.flush sock.gets(TERMINATOR) # clear the buffer end +# rubocop:enable Metrics/MethodLength if %w[all get].include?(bench_target) Vernier.profile(out: 'client_get_profile.json') do @@ -126,7 +133,7 @@ if %w[all get].include?(bench_target) Vernier.profile(out: 'socket_get_profile.json') do start_time = Time.now - while Time.now - start_time < bench_time do + while Time.now - start_time < bench_time sock.write("get sock_key\r\n") sock.readline sock.read(payload.bytesize) From d872856b4bc9fb46a42d47b39b19387db085cff9 Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Mon, 10 Feb 2025 22:17:52 -0700 Subject: [PATCH 3/4] leave out set_multi until we send that PR --- bin/benchmark | 13 +++++++------ bin/profile | 8 +++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index 96802ae2..70c520cd 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -45,8 +45,8 @@ TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, - raw: 
true) +# multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, +# raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. @@ -208,9 +208,10 @@ if %w[all set_multi].include?(bench_target) end end end - x.report('multi client set_multi 100') do - multi_client.set_multi(pairs, 3600, raw: true) - end + # TODO: uncomment this once we add PR adding set_multi + # x.report('multi client set_multi 100') do + # multi_client.set_multi(pairs, 3600, raw: true) + # end x.report('write 100 keys rawsock') do count = pairs.length tail = '' @@ -226,7 +227,7 @@ if %w[all set_multi].include?(bench_target) sock.flush sock.gets(TERMINATOR) # clear the buffer end - x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } + # x.report('write_mutli 100 keys') { client.set_multi(pairs, 3600, raw: true) } x.compare! end end diff --git a/bin/profile b/bin/profile index 33d10a64..46a02f04 100755 --- a/bin/profile +++ b/bin/profile @@ -174,7 +174,13 @@ end if %w[all set_multi].include?(bench_target) Vernier.profile(out: 'client_set_multi_profile.json') do start_time = Time.now - client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + # until we port over set_multi, compare the simple loop + # client.set_multi(pairs, 3600, raw: true) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + pairs.each do |key, value| + client.set(key, value, 3600, raw: true) + end + end end Vernier.profile(out: 'socket_set_multi_profile.json') do From f51fbcc4bf348e29f44ce56e455889ee5eabb30f Mon Sep 17 00:00:00 2001 From: Dan Mayer Date: Tue, 11 Feb 2025 16:16:39 -0700 Subject: [PATCH 4/4] improvements to profile and benchmark based on feedback --- bin/benchmark | 48 +++++++++++++++++++++++++++++++++++------------- bin/profile | 29 +++++++++++++++++++---------- 2 files changed, 54 
insertions(+), 23 deletions(-) diff --git a/bin/benchmark b/bin/benchmark index 70c520cd..8894fb5a 100755 --- a/bin/benchmark +++ b/bin/benchmark @@ -34,8 +34,6 @@ class StringSerializer end dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' - -ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'set' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_warmup = (ENV['BENCH_WARMUP'] || 3).to_i @@ -44,9 +42,9 @@ payload = 'B' * bench_payload_size TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" -client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false, raw: true) -# multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, -# raw: true) +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false, raw: true) +multi_client = Dalli::Client.new('localhost:11211,localhost:11222', serializer: StringSerializer, compress: false, + raw: true) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -56,18 +54,29 @@ sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) # Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size # sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) # Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size -sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) # ensure the clients are all connected and working client.set('key', payload) +multi_client.set('multi_key', payload) sock.write("set sock_key 0 3600 #{payload.bytesize}\r\n") sock.write(payload) sock.write(TERMINATOR) sock.flush sock.readline # clear the buffer +raise 'dalli client mismatch' if payload != client.get('key') + +raise 'multi dalli client mismatch' if payload != multi_client.get('multi_key') + +sock.write("mg sock_key v\r\n") +sock.readline +sock_value = sock.read(payload.bytesize) +sock.read(TERMINATOR.bytesize) +raise 'sock mismatch' if payload != sock_value + # ensure we have basic data for the benchmarks and get calls -payload_smaller = 'B' * 50_000 +payload_smaller = 'B' * (bench_payload_size / 10) pairs = {} 100.times do |i| pairs["multi_#{i}"] = payload_smaller @@ -157,17 +166,22 @@ end if %w[all get].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) - x.report('get dalli') { client.get('key') } + x.report('get dalli') do + result = client.get('key') + raise 'mismatch' unless result == payload + end # NOTE: while this is the fastest it is not thread safe and is blocking vs IO sharing friendly x.report('get sock') do - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline - sock.read(payload.bytesize) + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload end # NOTE: This shows that when adding thread safety & non-blocking IO 
we are slower for single process/thread use case x.report('get sock non-blocking') do @lock.synchronize do - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline count = payload.bytesize value = String.new(capacity: count + 1) @@ -183,6 +197,8 @@ if %w[all get].include?(bench_target) end break if value.bytesize == count end + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless value == payload end end x.compare! @@ -192,8 +208,14 @@ end if %w[all get_multi].include?(bench_target) Benchmark.ips do |x| x.config(warmup: bench_warmup, time: bench_time, suite: suite) - x.report('get 100 keys') { client.get_multi(pairs.keys) } - x.report('get 100 keys raw sock') { sock_get_multi(sock, pairs) } + x.report('get 100 keys') do + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end + x.report('get 100 keys raw sock') do + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end x.compare! end end diff --git a/bin/profile b/bin/profile index 46a02f04..cd2b32c6 100755 --- a/bin/profile +++ b/bin/profile @@ -34,15 +34,13 @@ class StringSerializer end dalli_url = ENV['BENCH_CACHE_URL'] || '127.0.0.1:11211' - -ENV['BENCH_CACHE_URL'].gsub('unix://', '') if dalli_url.include?('unix') bench_target = ENV['BENCH_TARGET'] || 'get' bench_time = (ENV['BENCH_TIME'] || 10).to_i bench_payload_size = (ENV['BENCH_PAYLOAD_SIZE'] || 700_000).to_i TERMINATOR = "\r\n" puts "yjit: #{RubyVM::YJIT.enabled?}" -client = Dalli::Client.new('localhost', serializer: StringSerializer, compress: false) +client = Dalli::Client.new(dalli_url, serializer: StringSerializer, compress: false) # The raw socket implementation is used to benchmark the performance of dalli & the overhead of the various abstractions # in the library. 
@@ -52,7 +50,7 @@ sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true) # Benchmarks didn't see any performance gains from increasing the SO_RCVBUF buffer size # sock.setsockopt(Socket::SOL_SOCKET, ::Socket::SO_RCVBUF, 1024 * 1024 * 8) # Benchamrks did see an improvement in performance when increasing the SO_SNDBUF buffer size -sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) +# sock.setsockopt(Socket::SOL_SOCKET, Socket::SO_SNDBUF, 1024 * 1024 * 8) payload = 'B' * bench_payload_size dalli_key = 'dalli_key' @@ -65,7 +63,7 @@ sock.flush sock.readline # clear the buffer # ensure we have basic data for the benchmarks and get calls -payload_smaller = 'B' * 50_000 +payload_smaller = 'B' * (bench_payload_size / 10) pairs = {} 100.times do |i| pairs["multi_#{i}"] = payload_smaller @@ -128,15 +126,20 @@ end if %w[all get].include?(bench_target) Vernier.profile(out: 'client_get_profile.json') do start_time = Time.now - client.get(dalli_key) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + result = client.get(dalli_key) + raise 'mismatch' unless result == payload + end end Vernier.profile(out: 'socket_get_profile.json') do start_time = Time.now while Time.now - start_time < bench_time - sock.write("get sock_key\r\n") + sock.write("mg sock_key v\r\n") sock.readline - sock.read(payload.bytesize) + result = sock.read(payload.bytesize) + sock.read(TERMINATOR.bytesize) + raise 'mismatch' unless result == payload end end end @@ -162,12 +165,18 @@ end if %w[all get_multi].include?(bench_target) Vernier.profile(out: 'client_get_multi_profile.json') do start_time = Time.now - client.get_multi(pairs.keys) while Time.now - start_time < bench_time + while Time.now - start_time < bench_time + result = client.get_multi(pairs.keys) + raise 'mismatch' unless result == pairs + end end Vernier.profile(out: 'socket_get_multi_profile.json') do start_time = Time.now - sock_get_multi(sock, pairs) while Time.now - start_time < 
bench_time + while Time.now - start_time < bench_time + result = sock_get_multi(sock, pairs) + raise 'mismatch' unless result == pairs + end end end