Do not increase failed_rpc_count when Rpc status is False #1143

Open
wants to merge 4 commits into base: v2.10.x
9 changes: 4 additions & 5 deletions core/node.py
@@ -447,11 +447,10 @@
         res = []
         for node_id in node_ids:
-            if str(skale.nodes.get_node_status(node_id)) == str(NodeStatus.ACTIVE.value):
-                ip_bytes = skale.nodes.contract.functions.getNodeIP(
-                    node_id).call()
-                ip = ip_from_bytes(ip_bytes)
-                res.append([node_id, ip, is_port_open(ip, WATCHDOG_PORT)])
+            ip_bytes = skale.nodes.contract.functions.getNodeIP(
+                node_id).call()
+            ip = ip_from_bytes(ip_bytes)
+            res.append([node_id, ip, is_port_open(ip, WATCHDOG_PORT)])
         logger.info(f'validator_nodes check - node_id: {node_id}, res: {res}')
     except Exception as err:
         return {'status': 1, 'errors': [err]}

Codecov / codecov/patch: added lines core/node.py#L450 and #L452-L453 are not covered by tests.
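The practical effect of this hunk is that the watchdog-port check now runs for every node id in `node_ids`, not only for nodes whose on-chain status is `NodeStatus.ACTIVE`. A tiny self-contained illustration with made-up data (the statuses, IPs, and hard-coded port result below are invented for the example, not the real SKALE API):

```python
# Invented data for illustration only; in the real code the status, IP and
# port check come from skale.nodes and is_port_open(ip, WATCHDOG_PORT).
ACTIVE = 0  # assumed numeric value of NodeStatus.ACTIVE, for this sketch only

node_ids = [1, 2, 3]
node_statuses = {1: ACTIVE, 2: 1, 3: ACTIVE}   # node 2 is not ACTIVE
node_ips = {1: '10.0.0.1', 2: '10.0.0.2', 3: '10.0.0.3'}

# Old behaviour: only ACTIVE nodes were probed on the watchdog port.
old_res = [[n, node_ips[n], True] for n in node_ids if node_statuses[n] == ACTIVE]
# New behaviour: every requested node id is probed, regardless of status.
new_res = [[n, node_ips[n], True] for n in node_ids]

assert [row[0] for row in old_res] == [1, 3]
assert [row[0] for row in new_res] == [1, 2, 3]
```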
27 changes: 12 additions & 15 deletions core/schains/monitor/rpc.py
@@ -26,50 +26,47 @@
 from tools.docker_utils import DockerUtils

 from tools.configs.schains import MAX_SCHAIN_FAILED_RPC_COUNT
-from tools.configs.containers import (
-    MAX_SCHAIN_RESTART_COUNT,
-    SCHAIN_CONTAINER
-)
+from tools.configs.containers import MAX_SCHAIN_RESTART_COUNT, SCHAIN_CONTAINER

 logger = logging.getLogger(__name__)


-def handle_failed_schain_rpc(
-    schain: SchainStructure,
-    schain_record,
-    skaled_status,
-    dutils=None
-):
+def handle_failed_schain_rpc(schain: SchainStructure, schain_record, skaled_status, dutils=None):
     dutils = dutils or DockerUtils()
     logger.info(f'Monitoring RPC for sChain {schain.name}')

     if not is_container_exists(schain.name, dutils=dutils):
-        logger.warning(f'{schain.name} RPC monitor failed: container doesn\'t exit')
+        logger.warning('RPC monitor failed: container does not exist')
         return

     if not is_container_running(schain.name, dutils=dutils):
-        logger.warning(f'{schain.name} RPC monitor failed: container is not running')
+        logger.warning('RPC monitor failed: container is not running')
         return

     if skaled_status.exit_time_reached:
-        logger.info(f'{schain.name} - Skipping RPC monitor: exit time reached')
+        logger.info('Skipping RPC monitor: exit time reached')
         skaled_status.log()
         schain_record.set_failed_rpc_count(0)
         return

     if skaled_status.downloading_snapshot:
-        logger.info(f'{schain.name} - Skipping RPC monitor: downloading snapshot')
+        logger.info('Skipping RPC monitor: downloading snapshot')
         skaled_status.log()
         schain_record.set_failed_rpc_count(0)
         return

+    if not skaled_status.subsystem_running['Rpc']:
+        logger.info('Skipping RPC monitor: Rpc has not been initialized')
+        skaled_status.log()
+        schain_record.set_failed_rpc_count(0)
+
     rpc_stuck = schain_record.failed_rpc_count > MAX_SCHAIN_FAILED_RPC_COUNT
     logger.info(
         'SChain %s, rpc stuck: %s, failed_rpc_count: %d, restart_count: %d',
         schain.name,
         rpc_stuck,
         schain_record.failed_rpc_count,
-        schain_record.restart_count
+        schain_record.restart_count,
     )
     if rpc_stuck:
         if schain_record.restart_count < MAX_SCHAIN_RESTART_COUNT:

Codecov / codecov/patch: added line core/schains/monitor/rpc.py#L47 is not covered by tests.
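The new `subsystem_running['Rpc']` guard is the core of this PR: when skaled reports that the Rpc subsystem has not started yet, the counter of failed RPC checks is reset instead of being allowed to grow, so the stuck-RPC restart branch below it cannot trigger. A minimal, self-contained sketch of that interaction, using stand-in classes and an assumed limit rather than the project's real `SChainRecord` and config values:

```python
# Stand-in objects, not the project's real classes; the constant value is
# assumed purely for illustration.
MAX_SCHAIN_FAILED_RPC_COUNT = 3


class FakeRecord:
    def __init__(self):
        self.failed_rpc_count = 10  # pretend RPC checks have been failing for a while
        self.restart_count = 0

    def set_failed_rpc_count(self, value):
        self.failed_rpc_count = value


def rpc_is_stuck(record, subsystem_running):
    # Mirrors the new guard: an uninitialized Rpc subsystem resets the counter,
    # so the "stuck" condition below cannot be satisfied.
    if not subsystem_running['Rpc']:
        record.set_failed_rpc_count(0)
    return record.failed_rpc_count > MAX_SCHAIN_FAILED_RPC_COUNT


record = FakeRecord()
assert rpc_is_stuck(record, {'Rpc': False}) is False  # counter reset, no restart path
record.set_failed_rpc_count(10)
assert rpc_is_stuck(record, {'Rpc': True}) is True    # Rpc is up but unresponsive
```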
35 changes: 3 additions & 32 deletions tests/conftest.py
@@ -55,7 +55,7 @@
     ENV_TYPE,
     META_FILEPATH,
     SSL_CERTIFICATES_FILEPATH,
-    STATIC_GROUPS_FOLDER
+    STATIC_GROUPS_FOLDER,
 )
 from tools.configs.containers import CONTAINERS_FILEPATH
 from tools.configs.ima import SCHAIN_IMA_ABI_FILEPATH
@@ -81,6 +81,7 @@
     init_skale_from_wallet,
     init_skale_ima,
     upsert_schain_record_with_config,
+    generate_schain_skaled_status_file,
 )

 NUMBER_OF_NODES = 2
@@ -197,28 +198,6 @@ def get_random_string(length=8):
     return ''.join(random.choice(letters) for i in range(length))


-def get_skaled_status_dict(
-    snapshot_downloader=False,
-    exit_time_reached=False,
-    clear_data_dir=False,
-    start_from_snapshot=False,
-    start_again=False,
-):
-    return {
-        'subsystemRunning': {
-            'SnapshotDownloader': snapshot_downloader,
-            'Blockchain': False,
-            'Rpc': False,
-        },
-        'exitState': {
-            'ClearDataDir': clear_data_dir,
-            'StartAgain': start_again,
-            'StartFromSnapshot': start_from_snapshot,
-            'ExitTimeReached': exit_time_reached,
-        },
-    }
-
-
 SECRET_KEY = {
     'common_public_key': [
         11111111111111111111111111111111111111111111111111111111111111111111111111111,
@@ -305,13 +284,6 @@ def schain_config(_schain_name, secret_key, predeployed_ima):
     rm_schain_dir(_schain_name)


-def generate_schain_skaled_status_file(_schain_name, **kwargs):
-    schain_dir_path = os.path.join(SCHAINS_DIR_PATH, _schain_name)
-    pathlib.Path(schain_dir_path).mkdir(parents=True, exist_ok=True)
-    status_filepath = skaled_status_filepath(_schain_name)
-    write_json(status_filepath, get_skaled_status_dict(**kwargs))
-
-
 def rm_schain_dir(schain_name):
     schain_dir_path = os.path.join(SCHAINS_DIR_PATH, schain_name)
     # fix permission denied after schain container running
@@ -607,8 +579,7 @@ def static_groups_for_schain(_schain_name):
     parent_folder = os.path.join(STATIC_GROUPS_FOLDER, ENV_TYPE)
     os.makedirs(parent_folder)
     static_groups_env_path = os.path.join(
-        parent_folder,
-        os.path.join(f'schain-{_schain_name}.json')
+        parent_folder, os.path.join(f'schain-{_schain_name}.json')
     )
     try:
         write_json(static_groups_env_path, STATIC_NODE_GROUPS)
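The removed helpers are imported from `tests.utils` in the updated tests (see the rpc_test.py diff below), so they presumably moved there and grew an `rpc` (and, judging by skaled_status_test.py, a `blockchain`) keyword that defaults to `True`. tests/utils.py itself is not part of this diff; the following is only a sketch inferred from the call sites:

```python
import json


def get_skaled_status_dict(
    snapshot_downloader=False,
    exit_time_reached=False,
    clear_data_dir=False,
    start_from_snapshot=False,
    start_again=False,
    blockchain=True,   # assumed new keyword, default True per skaled_status_test.py
    rpc=True,          # assumed new keyword, default True per skaled_status_test.py
):
    # Same shape as the dict that used to live in conftest.py.
    return {
        'subsystemRunning': {
            'SnapshotDownloader': snapshot_downloader,
            'Blockchain': blockchain,
            'Rpc': rpc,
        },
        'exitState': {
            'ClearDataDir': clear_data_dir,
            'StartAgain': start_again,
            'StartFromSnapshot': start_from_snapshot,
            'ExitTimeReached': exit_time_reached,
        },
    }


# generate_schain_skaled_status_file(_schain_name, **kwargs) presumably still
# writes this dict to the sChain's skaled status file, as the removed
# conftest.py helper did; rpc_test.py calls it with rpc=False and rpc=True.
print(json.dumps(get_skaled_status_dict(rpc=False), indent=2))
```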
38 changes: 30 additions & 8 deletions tests/schains/monitor/rpc_test.py
@@ -9,9 +9,10 @@
 from core.schains.monitor.rpc import handle_failed_schain_rpc
 from core.schains.runner import get_container_info
 from core.schains.rpc import check_endpoint_blocks
-from tools.configs.containers import SCHAIN_CONTAINER
+from tools.configs.containers import SCHAIN_CONTAINER, MAX_SCHAIN_RESTART_COUNT
+from tools.configs.schains import MAX_SCHAIN_FAILED_RPC_COUNT
 from web.models.schain import SChainRecord
-from tests.utils import get_schain_struct
+from tests.utils import get_schain_struct, generate_schain_skaled_status_file

 CURRENT_TIMESTAMP = 1594903080
 CURRENT_DATETIME = datetime.datetime.utcfromtimestamp(CURRENT_TIMESTAMP)
@@ -39,7 +40,7 @@ def test_handle_failed_schain_rpc_exit_time_reached(

     dutils.run_container(image_name=image_name, name=container_name, entrypoint='bash -c "exit 0"')
     time.sleep(7)
-    schain_record.set_failed_rpc_count(100)
+    schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)

     container_info = dutils.get_info(container_name)
     finished_at = container_info['stats']['State']['FinishedAt']
@@ -67,7 +68,7 @@ def test_monitor_schain_downloading_snapshot(
         image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
     )
     time.sleep(7)
-    schain_record.set_failed_rpc_count(100)
+    schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)

     container_info = dutils.get_info(container_name)
     finished_at = container_info['stats']['State']['FinishedAt']
@@ -91,8 +92,8 @@ def test_handle_failed_schain_rpc_stuck_max_retries(
         image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
     )

-    schain_record.set_failed_rpc_count(100)
-    schain_record.set_restart_count(100)
+    schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
+    schain_record.set_restart_count(MAX_SCHAIN_RESTART_COUNT + 1)

     container_info = dutils.get_info(container_name)
     finished_at = container_info['stats']['State']['FinishedAt']
@@ -116,7 +117,7 @@ def test_monitor_container_exited(schain_db, dutils, cleanup_schain_containers,
     # Wait for container initialization
     time.sleep(2)

-    schain_record.set_failed_rpc_count(100)
+    schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
     schain_record.set_restart_count(0)

     container_info = dutils.get_info(container_name)
@@ -145,12 +146,33 @@ def test_handle_failed_schain_rpc_stuck(
         image_name=image_name, name=container_name, entrypoint='bash -c "sleep 100"'
     )

-    schain_record.set_failed_rpc_count(100)
+    schain_record.set_failed_rpc_count(MAX_SCHAIN_FAILED_RPC_COUNT)
     schain_record.set_restart_count(0)

     container_info = dutils.get_info(container_name)
     finished_at = container_info['stats']['State']['FinishedAt']

     assert schain_record.restart_count == 0
+
+    # Make sure restart is not executed with Rpc: False in status file
+    generate_schain_skaled_status_file(schain_db, rpc=False)
+    handle_failed_schain_rpc(
+        schain=get_schain_struct(schain_name=schain_db),
+        schain_record=schain_record,
+        skaled_status=skaled_status,
+        dutils=dutils,
+    )
+    assert schain_record.restart_count == 0
+    container_info = dutils.get_info(container_name)
+    assert container_info['stats']['State']['FinishedAt'] == finished_at
+
+    container_info = dutils.get_info(container_name)
+    finished_at = container_info['stats']['State']['FinishedAt']
+
+    # With Rpc: True restart should be executed
+    generate_schain_skaled_status_file(schain_db, rpc=True)
+    schain_record.set_failed_rpc_count(100)
+
+    assert schain_record.restart_count == 0
     handle_failed_schain_rpc(
         schain=get_schain_struct(schain_name=schain_db),
4 changes: 2 additions & 2 deletions tests/schains/skaled_status_test.py
@@ -17,8 +17,8 @@ def test_skaled_status(skaled_status, _schain_name):

     assert skaled_status.subsystem_running == {
         'SnapshotDownloader': False,
-        'Blockchain': False,
-        'Rpc': False,
+        'Blockchain': True,
+        'Rpc': True,
     }

     assert skaled_status.exit_state == {