Skip to content

Commit

Permalink
Merge pull request #28 from moonlibs/fix-fencing
Browse files Browse the repository at this point in the history
Fixes fencing timeouts
  • Loading branch information
ochaton authored Aug 17, 2023
2 parents 0caeb50 + 2582d6b commit 87ce6e2
Show file tree
Hide file tree
Showing 14 changed files with 432 additions and 31 deletions.
3 changes: 2 additions & 1 deletion .luacheckrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ std = "tarantool"
max_line_length = 200
codes = true
include_files = {"config.lua", "config/"}
ignore = {"212"}
read_globals = {"config"}
ignore = {"212"}
5 changes: 5 additions & 0 deletions Dockerfile.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Test image: Tarantool 2.10 plus the toolchain needed to build rocks.
FROM tarantool/tarantool:2.10

# git/cmake/make/gcc/musl-dev are required to compile luarocks modules on Alpine.
RUN apk add -u git cmake make gcc musl-dev
# Pin luatest to 0.5.7 for reproducible test runs.
RUN tarantoolctl rocks install luatest 0.5.7
# luacov-console renders the coverage report collected by `luatest --coverage`.
RUN tarantoolctl rocks install luacov-console
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Declare phony targets: these names never correspond to real files, so make
# must always run their recipes. (NOTE: the original used `.PHONY := all test`,
# which merely assigns a variable named ".PHONY" and marks nothing phony.)
.PHONY: run-compose build-testing-image test

# Bring up the etcd/tarantool docker-compose environment used by the tests.
run-compose:
	make -C test run-compose

# Build the throwaway image the test suite runs inside.
build-testing-image:
	docker build -t config-test -f Dockerfile.test .

# Run the suite in a container on the compose network, mounting the repo
# read-write so luacov can write coverage output back to the host.
test: build-testing-image run-compose
	docker run --name config-test \
		--net tt_net \
		-e TT_ETCD_ENDPOINTS="http://etcd0:2379,http://etcd1:2379,http://etcd2:2379" \
		--rm -v $$(pwd):/source/config \
		-v $$(pwd)/data:/tmp/ \
		--workdir /source/config \
		--entrypoint '' \
		config-test \
		./run_test_in_docker.sh
47 changes: 37 additions & 10 deletions config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ local M
cfg.box.replication_connect_quorum = optimal_rcq(cfg.box.replication)
end
log.info("Start non-bootstrapped tidy loading with ro=%s rcq=%s rct=%s (dir=%s)",
cfg.box.read_only, snap_dir, cfg.box.replication_connect_quorum, cfg.box.replication_connect_timeout)
cfg.box.read_only, cfg.box.replication_connect_quorum, cfg.box.replication_connect_timeout, snap_dir)
end
end

Expand Down Expand Up @@ -1003,17 +1003,31 @@ local M
end

local function fencing_check(deadline)
local timeout = math.min((deadline-fiber.time()), fencing_pause)
-- we can only allow half of the time till deadline
local timeout = math.min((deadline-fiber.time())*0.5, fencing_pause)

local check_started = fiber.time()
local pcall_ok, err_or_resolution, new_cluster = pcall(function()
local started = fiber.time()
local n_endpoints = #config.etcd.endpoints
local not_timed_out, response = config.etcd:wait(watch_path, {
index = watch_index,
timeout = timeout,
timeout = timeout/n_endpoints,
})
log.verbose("[fencing] wait(%s,index=%s,timeout=%.3fs) => %s (ind:%s) %s",
local logger
if not_timed_out then
if tonumber(response.status) and tonumber(response.status) >= 400 then
logger = log.error
else
logger = log.info
end
else
logger = log.verbose
end
logger("[fencing] wait(%s,index=%s,timeout=%.3fs) => %s (ind:%s) %s took %.3fs",
watch_path, watch_index, timeout,
response.status, response.headers['x-etcd-index'],
json.encode(response.body))
response.status, (response.headers or {})['x-etcd-index'],
json.encode(response.body), fiber.time()-started)

-- http timed out / or network drop - we'll never know
if not not_timed_out then return 'timeout' end
Expand All @@ -1029,7 +1043,7 @@ local M
if res.node then
local node = {}
config.etcd:recursive_extract(watch_path, res.node, node)
log.verbose("[fencing] watch index changed: %s => %s", watch_path, json.encode(node))
log.info("[fencing] watch index changed: %s => %s", watch_path, json.encode(node))
if not node.master then node = nil end
return 'changed', node
end
Expand All @@ -1046,7 +1060,6 @@ local M
end

if not new_cluster then
local sleep = math.max(fencing_pause / 2, (deadline - fiber.time()) / 2)
repeat
local ok, e_cluster = pcall(refresh_list)
if ok and e_cluster then
Expand All @@ -1055,6 +1068,8 @@ local M
end

if not in_my_gen() then return end
-- we can only sleep 50% till deadline will be reached
local sleep = math.min(fencing_pause, 0.5*(deadline - fiber.time()))
fiber.sleep(sleep)
until fiber.time() > deadline
end
Expand Down Expand Up @@ -1153,7 +1168,9 @@ local M
-- Before ETCD check we better pause
-- we do a little bit randomized sleep to not spam ETCD
fiber.sleep(
math.random(0, (fencing_timeout - fencing_pause) / 10)
math.random(0,
0.1*math.min(deadline-fiber.time(),fencing_timeout-fencing_pause)
)
)
-- After each yield we have to check that we are still in our generation
if not in_my_gen() then return end
Expand All @@ -1166,15 +1183,25 @@ local M
-- then we update leadership leasing
if fencing_check(deadline) then
-- update deadline.
if deadline <= fiber.time() then
log.warn("[fencing] deadline was overflowed deadline:%s, now:%s",
deadline, fiber.time()
)
end
log.verbose("[fencing] Leasing ft:%.3fs up:%.3fs left:%.3fs",
fencing_timeout,
fiber.time()+fencing_timeout-deadline,
deadline - fiber.time()
)
deadline = fiber.time()+fencing_timeout
end

if not in_my_gen() then return end

if deadline <= fiber.time() then
log.warn("[fencing] deadline has not been upgraded deadline:%s, now:%s",
deadline, fiber.time()
)
end
until box.info.ro or fiber.time() > deadline

-- We have left deadline-loop. It means that fencing is required
Expand Down
5 changes: 5 additions & 0 deletions run_test_in_docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/sh
# Entry point executed inside the config-test container (see Makefile `test`).
# Fail fast: without -e a failed setup command would be ignored and the run
# could report a misleading result.
set -e

# Log the working directory for easier CI debugging.
pwd
# Drop stale luacov/luatest caches from a previous run so coverage is clean.
rm -rf /root/.cache/
# Exit status of luatest becomes the container's (and the CI job's) status.
.rocks/bin/luatest --coverage -c -v spec/01_single_test.lua
149 changes: 149 additions & 0 deletions spec/01_single_test.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
local t = require 'luatest' --[[@as luatest]]
local uri = require 'uri'

-- Baseline etcd configuration shared by every parameterized case; each case
-- fills in `apps.single.instances` before uploading it.
local base_config = {
	apps = {
		single = {
			common = { box = { log_level = 4 } },
		}
	}
}

-- Parameterized group: one instance, two instances both running, and two
-- declared instances with only one actually started.
local g = t.group('single', {
	{ instances = {single = '127.0.0.1:3301'}, run = {'single'} },
	{
		instances = {single_01 = '127.0.0.1:3301', single_02 = '127.0.0.1:3302'},
		run = {'single_01', 'single_02'}
	},
	{
		instances = {single_01 = '127.0.0.1:3301', single_02 = '127.0.0.1:3302'},
		run = {'single_01'}
	},
})

-- Resolve paths relative to this spec file so the suite is cwd-independent.
local this_file = debug.getinfo(1, "S").source:sub(2)
local fio = require 'fio'

local root = fio.dirname(this_file)
local init_lua = fio.pathjoin(root, 'mock', 'single', 'init.lua')

-- Environment template for spawned instances. Data dirs are filled per test
-- in before_each; TT_ETCD_ENDPOINTS falls back to a local etcd.
local base_env = {
	TT_WAL_DIR = nil, -- will be set at before_each trigger
	TT_MEMTX_DIR = nil, -- will be set at before_each trigger
	TT_ETCD_PREFIX = '/apps/single',
	TT_CONFIG = fio.pathjoin(root, 'mock', 'single', 'conf.lua'),
	TT_MASTER_SELECTION_POLICY = 'etcd.instance.single',
	TT_ETCD_ENDPOINTS = os.getenv('TT_ETCD_ENDPOINTS') or "http://127.0.0.1:2379",
}

local h = require 'spec.helper'
-- Per-test bookkeeping of started tarantools, keyed by case name, so
-- after_each can stop everything even when assertions fail mid-test.
local test_ctx = {}

-- Scratch directory for WAL/memtx files; recreated before every test.
local working_dir

-- Give every test a fresh scratch directory and point both tarantool data
-- directories (WAL and memtx snapshots) into it.
g.before_each(function()
	local dir = h.create_workdir()
	working_dir = dir
	base_env.TT_WAL_DIR = dir
	base_env.TT_MEMTX_DIR = dir
end)

-- Teardown: stop every tarantool started by any case, then drop the scratch
-- directory and wipe the etcd prefix so cases do not leak state into each other.
g.after_each(function()
	for _, ctx in pairs(test_ctx) do
		for _, entry in pairs(ctx.tts) do
			entry.tt:stop()
		end
	end

	h.clean_directory(working_dir)
	h.clear_etcd()
end)

-- Starts the instances listed in cg.params.run against an etcd-published
-- config, then verifies box.cfg and config.get('sys') twice: right after
-- start and again after a restart. The two verification passes previously
-- duplicated ~30 lines of assertions; they now share assert_instance_state.
function g.test_run_instances(cg)
	local params = cg.params
	local this_ctx = { tts = {} }
	test_ctx[cg.name] = this_ctx

	-- Declare every instance from the case parameters in the etcd config.
	local etcd_config = table.deepcopy(base_config)
	etcd_config.apps.single.instances = {}
	for instance_name, listen_uri in pairs(params.instances) do
		etcd_config.apps.single.instances[instance_name] = { box = { listen = listen_uri } }
	end

	h.upload_to_etcd(etcd_config)

	-- Spawn only the instances this case actually runs.
	for _, name in ipairs(params.run) do
		local env = table.deepcopy(base_env)
		env.TT_INSTANCE_NAME = name
		local net_box_port = tonumber(uri.parse(etcd_config.apps.single.instances[name].box.listen).service)

		local tt = h.start_tarantool({
			alias = name,
			env = env,
			command = init_lua,
			args = {},
			net_box_port = net_box_port,
		})

		table.insert(this_ctx.tts, {
			tt = tt,
			net_box_port = net_box_port,
			env = env,
			name = name,
		})
	end

	-- Shared checks for one running instance; `suffix` distinguishes the
	-- first-start pass from the post-restart pass in failure messages.
	local function assert_instance_state(entry, suffix)
		local box_cfg = entry.tt:get_box_cfg()
		t.assert_covers(box_cfg, {
			log_level = etcd_config.apps.single.common.box.log_level,
			listen = etcd_config.apps.single.instances[entry.name].box.listen,
			read_only = false,
		}, 'box.cfg is correct' .. suffix)

		local conn = entry.tt --[[@as luatest.server]]
		-- Functions cannot cross the net.box boundary; strip them remotely
		-- before returning the 'sys' section.
		local ret = conn:exec(function()
			local r = table.deepcopy(config.get('sys'))
			for k, v in pairs(r) do
				if type(v) == 'function' then
					r[k] = nil
				end
			end
			return r
		end)

		t.assert_covers(ret, {
			instance_name = entry.name,
			master_selection_policy = 'etcd.instance.single',
			file = base_env.TT_CONFIG,
		}, 'get("sys") has correct fields' .. suffix)
	end

	-- Pass 1: connect and verify freshly started instances.
	for _, entry in ipairs(this_ctx.tts) do
		entry.tt:connect_net_box()
		assert_instance_state(entry, '')
	end

	-- Pass 2: restart each instance and verify the state survives.
	for _, entry in ipairs(this_ctx.tts) do
		h.restart_tarantool(entry.tt --[[@as luatest.server]])
		assert_instance_state(entry, ' after restart')
	end
end
Loading

0 comments on commit 87ce6e2

Please sign in to comment.