diff --git a/install/scripts.d/ta/195_ofed.sh b/install/scripts.d/ta/195_ofed.sh index 167e117..44209d8 100755 --- a/install/scripts.d/ta/195_ofed.sh +++ b/install/scripts.d/ta/195_ofed.sh @@ -2,23 +2,24 @@ DESCRIPTION="Check if Mellanox OFED is installed" SCRIPT_TYPE="parallel" +RETURN_CODE=0 # fail immediately if no OFED installed if ! ofed_info -n &> /dev/null; then - echo "OFED not installed" + echo "WARN: OFED not installed" exit 254 fi # is it a supported OFED version? OFEDVER=$(ofed_info -n) -case "$OFEDVER" in +case "$OFEDVER" in 5.1-2.5.8.0 | 5.1-2.6.2.0 | 5.4-3.4.0.0 | 5.4-3.5.8.0 | 5.6-1.0.3.3 | 5.6-2.0.9.0 | 5.7-1.0.2.0 | 5.8-1.1.2.1 | 5.8-3.0.7.0 | 5.9-0.5.6.0 | 23.04-1.1.3.0 | 23.10-0.5.5.0 ) #continue ;; *) - echo "Unsupported OFED version $OFEDVER" - exit 254 + echo "WARN: Unsupported OFED version $OFEDVER" + RETURN_CODE=254 ;; esac @@ -28,17 +29,20 @@ esac if modinfo mlx5_core &> /dev/null; then MLX5_VER=$(modinfo mlx5_core | awk '/^version:/{ print $2 }') else - echo "No mlx5_core kernel module loaded" - exit 254 + echo "WARN: No mlx5_core kernel module loaded" + RETURN_CODE=254 fi # make sure loaded drivers match the installed OFED if [[ -n "$MLX5_VER" ]]; then if [[ "$MLX5_VER" != "${OFEDVER:0:9}" ]]; then - echo "Loaded Mellanox driver $MLX5_VER does not match OFED version $OFEDVER!" - exit 254 + echo "WARN: Loaded Mellanox driver $MLX5_VER does not match OFED version $OFEDVER!" 
+ RETURN_CODE=254 fi fi -echo "Valid OFED configuration observed" -exit 0 +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "Valid OFED configuration observed" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/270_weka_local_resources_gateways.sh b/install/scripts.d/ta/270_weka_local_resources_gateways.sh index e6c1305..e9941ef 100755 --- a/install/scripts.d/ta/270_weka_local_resources_gateways.sh +++ b/install/scripts.d/ta/270_weka_local_resources_gateways.sh @@ -8,34 +8,42 @@ SCRIPT_TYPE="parallel" RETURN_CODE=0 -for WEKA_CONTAINER in $(sudo weka local ps --output name --no-header); do - if [[ ( ${WEKA_CONTAINER} == "ganesha" ) || \ - ( ${WEKA_CONTAINER} == "samba" ) || \ - ( ${WEKA_CONTAINER} == "smb" ) ]] ; then - continue - fi - # Look for network devices with no gateway - RESOURCES=$(sudo weka local resources --container ${WEKA_CONTAINER} --json) - NUMBER_OF_DEVICES_WITH_NO_GATEWAY=$(echo ${RESOURCES} | python3 -c 'import sys, json; data = json.load(sys.stdin); print(len([device for device in data["net_devices"] if device["gateway"] == ""]))') - - - if [[ ${NUMBER_OF_DEVICES_WITH_NO_GATEWAY} -ne 0 ]] ; then - DEVICES_WITH_NO_GATEWAY=$(echo ${RESOURCES} | python3 -c 'import sys, json; data = json.load(sys.stdin); print("\n".join([device["name"] for device in data["net_devices"] if device["gateway"] == ""]))') - echo "The container ${WEKA_CONTAINER} has the following network devices defined without an IP" - echo "gateway - this might not be a mistake but means Weka POSIX traffic will not" - echo "leave this subnet" - echo "" - echo ${DEVICES_WITH_NO_GATEWAY} - echo "" +for WEKA_CONTAINER in $(sudo weka local ps --output name --no-header | grep -vw -e envoy -e ganesha -e samba -e smbw -e s3); do + DEVICES_WITH_NO_GATEWAY="" + NET_DEVICE="" + NET_GATEWAY="" + + while read NET_ENTRY; do + if [[ ${NET_ENTRY} =~ "gateway:"(.*)"name:"(.*) ]]; then + NET_GATEWAY=${BASH_REMATCH[1]} + NET_DEVICE=${BASH_REMATCH[2]} + fi + + if [[ -n ${NET_DEVICE} ]]; then + if [[ -d 
/sys/class/net/${NET_DEVICE} ]]; then + NET_TYPE=$(cat /sys/class/net/${NET_DEVICE}/type) + if [[ -n ${NET_TYPE} && ${NET_TYPE} == "1" ]]; then # Only check ethernet devices + if [[ -z ${NET_GATEWAY} ]]; then + DEVICES_WITH_NO_GATEWAY="${DEVICES_WITH_NO_GATEWAY}${NET_DEVICE} " + fi + fi + fi + fi + done < <(weka local resources -C ${WEKA_CONTAINER} net --stable -J | grep -w -e gateway -e name | paste - - | tr -d \"\,[:blank:]) + + if [[ -n ${DEVICES_WITH_NO_GATEWAY} ]]; then + echo "The container ${WEKA_CONTAINER} has the network device(s) ${DEVICES_WITH_NO_GATEWAY}" + echo "defined without an IP gateway - this might not be a mistake but means Weka" + echo "POSIX traffic will not leave this subnet." echo "The likely fix for this is to do weka local resources net remove for each device," echo "then add back in with weka local resource net add --gateway ... --netmask .." - #RETURN_CODE=254 - exit 254 - fi + echo + RETURN_CODE=254 + fi done + if [[ ${RETURN_CODE} -eq 0 ]]; then echo "All Weka containers have network devices with gateways" fi -exit ${RETURN_CODE} - +exit ${RETURN_CODE} \ No newline at end of file diff --git a/install/scripts.d/ta/290_check_traces_free_space.sh b/install/scripts.d/ta/290_check_traces_free_space.sh index 3f8b297..ed556d9 100755 --- a/install/scripts.d/ta/290_check_traces_free_space.sh +++ b/install/scripts.d/ta/290_check_traces_free_space.sh @@ -52,7 +52,10 @@ if (( ${WEKA_ENSURE_FREE} > ${TRACES_FS_SIZE})) ; then echo "Weka is currently set to ensure that ${WEKA_ENSURE_FREE} bytes are free" echo "on ${WEKA_TRACES_DIR}, but this filesystem is only ${TRACES_FS_SIZE} bytes" echo "in size. These conditions cannot co-exist, so the outcome is that no" - echo "traces will be stored" + echo "traces will be stored." + echo "Recommended options:" + echo " . Increase the size of ${WEKA_TRACES_DIR}" + echo " . 
Reduce the size of traces with \"weka debug traces retention set --server-ensure-free XXXX\"" RETURN_CODE=1 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/390_data_folder.sh b/install/scripts.d/ta/390_data_folder.sh index bed2b92..e0a8039 100755 --- a/install/scripts.d/ta/390_data_folder.sh +++ b/install/scripts.d/ta/390_data_folder.sh @@ -29,6 +29,7 @@ if [ -d "/data" ] ; then else echo "to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" fi + echo "The recommended fix is to upgrade your version of Weka" RETURN_CODE=1 fi fi diff --git a/install/scripts.d/ta/400_s3_using_etcd.sh b/install/scripts.d/ta/400_s3_using_etcd.sh index 6365e61..fb38e85 100755 --- a/install/scripts.d/ta/400_s3_using_etcd.sh +++ b/install/scripts.d/ta/400_s3_using_etcd.sh @@ -40,12 +40,14 @@ if [ ${WEKA_S3_RUNNING} -ge 1 ] ; then if verlte ${MIN_VERSION} ${WEKA_VERSION} && verlte ${WEKA_VERSION} ${MAX_VERSION} ; then WEKA_ETCD_HOSTS=$(weka s3 cluster --json | python3 -c 'import sys, json; data = json.load(sys.stdin); print(len(data["etcd_cluster_hosts"]))') if [ ${WEKA_ETCD_HOSTS} -gt 0 ] ; then - echo "S3 cluster is running, and this version of Weka requires migration" + echo "S3 cluster is running, and this version of Weka requires a configuration change." if [[ ! 
-z "${WTA_REFERENCE}" ]]; then - echo "to ${JIRA_REFERENCE}, discussed in ${WTA_REFERENCE}, SFDC ${KB_REFERENCE}" + echo "Refer to ${JIRA_REFERENCE}, discussed in ${WTA_REFERENCE}, SFDC ${KB_REFERENCE}" else - echo "to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" + echo "Refer to ${JIRA_REFERENCE}, SFDC ${KB_REFERENCE}" fi + echo "If you require the S3 service, please contact Customer Success indicating" + echo " you need to move the S3 service from ETCD to KWAS, as indicated in KB 1181" RETURN_CODE=254 fi fi diff --git a/install/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh b/install/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh index 751199b..7f76ba3 100755 --- a/install/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh +++ b/install/scripts.d/ta/420_check_cross-numa_zone_memory_balance.sh @@ -37,6 +37,8 @@ if [[ ${RATIO_SEEN} -gt ${MAX_ALLOWED_RATIO} ]]; then echo "from starting due to lack of NUMA zone-local memory" echo "The ratio is ${RATIO_SEEN}% and the maximum allowed ratio is ${MAX_ALLOWED_RATIO}%" echo "The memory in the highest zone is ${MAX_MEMORY_SEEN_KB} and in the lowest zone is ${MIN_MEMORY_SEEN_KB}" + echo "One recommended resolution is to balance the memory between NUMA zones by physically" + echo "moving memory, or by adding more to the smaller NUMA zone" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh b/install/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh index db1e332..9978f9b 100755 --- a/install/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh +++ b/install/scripts.d/ta/430_nvme_used_capacity_vs_maximum.sh @@ -27,6 +27,13 @@ echo ${WEKA_SSD_USED_BYTES} # if we've allocated more than half the maximum theoretical SSD space, warn if [[ $((${WEKA_SSD_USED_BYTES}*2)) -gt ${WEKA_THEORETICAL_MAX_SSD_BYTES} ]] ; then + echo "You have used a significant proportion of the theoretical maximum" + echo "NVME capacity of the cluster which is decided at first install 
time." + echo "Please contact customer success to discuss options. Possible actions include:" + echo " . Adding an Object Store to expand data storage while keeping NVME capacity down" + echo " . In-place cluster resizing and migration (perhaps via snap2obj for fast backup/restore)" + echo " . Migrating to a different, larger cluster" + echo " . Pruning unnecessary data" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/440_hostnames_rfc952.sh b/install/scripts.d/ta/440_hostnames_rfc952.sh index b778a31..7ebaa80 100644 --- a/install/scripts.d/ta/440_hostnames_rfc952.sh +++ b/install/scripts.d/ta/440_hostnames_rfc952.sh @@ -23,6 +23,7 @@ GREP_RESULT=$(echo ${SHORT_HOSTNAME} | grep "[^-a-z0-9.]") if [[ $? -eq 0 ]]; then echo "The hostname ${SHORT_HOSTNAME} appears to contain a character other than [a-z], -, and [0-9]." echo "Refer to RFC 952 for more information" + echo "Recommended resolution: change the hostname to include only lowercase letters, digits and hyphens" RETURN_CODE=254 fi diff --git a/install/scripts.d/ta/450_custom_ca_certs.sh b/install/scripts.d/ta/450_custom_ca_certs.sh index 546e68e..8ca8391 100755 --- a/install/scripts.d/ta/450_custom_ca_certs.sh +++ b/install/scripts.d/ta/450_custom_ca_certs.sh @@ -15,6 +15,8 @@ grep -q SSL_CERT_FILE /opt/weka/dist/release/${WEKA_VERSION}.spec 2>/dev/null if [[ $? -eq 0 ]] ; then echo "This version of weka appears to use custom CA certificates. 
Care will be needed for upgrading" + echo "Recommended resolution: remove custom CA specification, and upgrade to a more recent" + echo "version that natively supports additional CA bundles" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/460_ip_source-based_routing.sh b/install/scripts.d/ta/460_ip_source-based_routing.sh index 7b1519a..02b5a0a 100755 --- a/install/scripts.d/ta/460_ip_source-based_routing.sh +++ b/install/scripts.d/ta/460_ip_source-based_routing.sh @@ -78,6 +78,9 @@ if [[ ${SOURCE_BASED_ROUTING_RECOMMENDED} -ge "1" ]] ; then echo "Warning: Not every interface appears to have arp_filter=1 set. This could lead to communication problems" RETURN_CODE="254" fi + echo "Recommended resolution: Although networking is typically site- and hardware-dependent," + echo " some example configurations for the common dual NIC setup are noted on the WEKA" + echo " documentation site: https://docs.weka.io/planning-and-installation/bare-metal/setting-up-the-hosts#configure-the-ha-networking" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/470_number_of_numa_domains.sh b/install/scripts.d/ta/470_number_of_numa_domains.sh index 7351808..15d1c67 100755 --- a/install/scripts.d/ta/470_number_of_numa_domains.sh +++ b/install/scripts.d/ta/470_number_of_numa_domains.sh @@ -36,6 +36,8 @@ echo -n "Detected $NUMBER_OF_NUMA_DOMAINS NUMA domains - " if [[ $NUMBER_OF_NUMA_DOMAINS -gt $MAXIMUM_NUMA_DOMAINS ]]; then RETURN_CODE=254 echo "Weka currenty only supports a maximum of 32 NUMA domains (4.2.11+)." 
+ echo " Recommended resolution: reduce the number of NUMA domains, perhaps by reducing" + echo " the NUMAs per socket setting in the machine's firmware" # 8 or fewer NUMAs is always supported elif [[ $NUMBER_OF_NUMA_DOMAINS -le 8 ]]; then @@ -45,16 +47,25 @@ elif [[ $NUMBER_OF_NUMA_DOMAINS -le 8 ]]; then elif vergte $WEKA_VERSION "4.3.0" && verlt $WEKA_VERSION "4.3.2" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 16 ]]; then RETURN_CODE=254 echo "Weka only supports more than 16 NUMA domains in 4.3.2 and higher." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" # More than 16 NUMAs only supported in 4.2.11+ elif vergt $WEKA_VERSION "4.2.6" && verlt $WEKA_VERSION "4.2.11" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 16 ]]; then RETURN_CODE=254 echo "Weka only supports more than 16 NUMA domains in (4.2.11+, 4.3.2+)." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" # More than 8 NUMAs only supported in 4.2.7+ elif verlt $WEKA_VERSION "4.2.7" && [[ $NUMBER_OF_NUMA_DOMAINS -gt 8 ]]; then RETURN_CODE=254 echo "Weka only supports more than 8 NUMA domains in 4.2.7 and higher." + echo " Recommended resolutions: either" + echo " . Reduce the number of NUMA domains, perhaps by reducing the NUMAs per socket setting in the machine's firmware" + echo " . Upgrade Weka to a more recent version" else echo "Number of NUMA domains is within supported limits." fi diff --git a/install/scripts.d/ta/480_check_weka_agent.sh b/install/scripts.d/ta/480_check_weka_agent.sh index b73b24f..2531b45 100755 --- a/install/scripts.d/ta/480_check_weka_agent.sh +++ b/install/scripts.d/ta/480_check_weka_agent.sh @@ -17,11 +17,15 @@ if [[ $? 
-ne "0" ]] ; then RETURN_CODE=254 echo "The service weka-agent is not reported as enabled by systemd" echo "This may cause weka to fail to start" + echo " Recommended Resolution: enable the service with systemctl enable weka-agent" - if [[ ! -L /etc/init.d ]]; then echo "/etc/init.d is expected to be a symlink to /etc/rc.d/init.d" echo "Without this systemd is unable to find and thus start the weka-agent sysV init script" + echo " Recommended Resolution: on RHEL-based OSes move any scripts to /etc/rc.d/init.d, remove" + echo " the /etc/init.d directory, and re-create it as a link. The following commands are" + echo " one way to achieve this" + echo " mv /etc/init.d/* /etc/rc.d/init.d/ && rmdir /etc/init.d && ln -s /etc/rc.d/init.d /etc/init.d" fi fi diff --git a/install/scripts.d/ta/490_ip_route_metrics.sh b/install/scripts.d/ta/490_ip_route_metrics.sh index de95023..b1e6ebe 100755 --- a/install/scripts.d/ta/490_ip_route_metrics.sh +++ b/install/scripts.d/ta/490_ip_route_metrics.sh @@ -46,6 +46,8 @@ if [[ ${NUMBER_OF_OVERLAPPING_ROUTES_WITH_METRICS} -gt "1" ]]; then echo "that these entries will negatively affect the performance of e.g. floating IP" echo "addresses. In any case it is unlikely that preferential IP routes are of" echo "benefit in a high-performance local network" + echo "Recommended Resolution: review the output of \"ip route\" and rationalize the routes," + echo " likely by removing or coalescing the overlapping routes into larger ranges" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/500_sysctl_rp_filter.sh b/install/scripts.d/ta/500_sysctl_rp_filter.sh index 1cdf643..6229554 100755 --- a/install/scripts.d/ta/500_sysctl_rp_filter.sh +++ b/install/scripts.d/ta/500_sysctl_rp_filter.sh @@ -35,16 +35,18 @@ if [[ $RP_FILTER_VALUE_ALL != "2" ]]; then echo "The value for net.ipv4.conf.${INTERFACE}.rp_filter is set to ${RP_FILTER_VALUE}." echo "This can disrupt floating IP addresses for protocols." 
echo "It is recommended to set net.ipv4.conf.${INTERFACE}.rp_filter to 2." + echo "Recommended resolution: set this value in e.g. /etc/sysctl.d/99-weka-nics.conf" elif [[ $RP_FILTER_VALUE_ALL == "1" && $RP_FILTER_VALUE == "0" ]]; then RETURN_CODE="254" echo "The value for net.ipv4.conf.${INTERFACE}.rp_filter is set to ${RP_FILTER_VALUE}." echo "The value for net.ipv4.conf.all.rp_filter is set to ${RP_FILTER_VALUE_ALL} and takes precedence." echo "This can disrupt floating IP addresses for protocols." echo "It is recommended to set net.ipv4.conf.${INTERFACE}.rp_filter or net.ipv4.conf.all.rp_filter to 2." + echo "Recommended resolution: set this value in e.g. /etc/sysctl.d/99-weka-nics.conf" fi done else echo "net.ipv4.conf.all.rp_filter is set to 2, no further testing necessary." fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/510_check_for_noprefixroute.sh b/install/scripts.d/ta/510_check_for_noprefixroute.sh index 25973ee..eae89a8 100755 --- a/install/scripts.d/ta/510_check_for_noprefixroute.sh +++ b/install/scripts.d/ta/510_check_for_noprefixroute.sh @@ -28,7 +28,10 @@ if [[ "${NOPREFIXROUTE_COUNT}" != "0" ]]; then echo "Certain IP addresses are configured with noprefixroute. 
This will inhibit the ability" echo "of certain cluster floating ips to accurately determine which link should be preferred" echo "The command \"ip -o -f inet route list match xxx.xxx.xxx.xxx/32 scope link\" needs to" - echo "Be able to return a device for each floating IP configured" + echo "be able to return a device for each floating IP configured" + echo "Recommended Resolution: remove the noprefixroute flag or otherwise ensure the" + echo " ip route list command given above can resolve the link on which you wish the" + echo " floating IP to be configured" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/520_bucket_and_process_uptime.sh b/install/scripts.d/ta/520_bucket_and_process_uptime.sh index e44968f..71d72ae 100755 --- a/install/scripts.d/ta/520_bucket_and_process_uptime.sh +++ b/install/scripts.d/ta/520_bucket_and_process_uptime.sh @@ -36,12 +36,20 @@ CURRENT_TIME_EPOCH=$( date +%s) if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_BUCKET_STARTTIME_EPOCH})) -lt 3600 ]]; then RETURN_CODE="254" echo "Weka buckets have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" - echo "but could be indicative of problems (e.g. network flapping" + echo "but could be indicative of problems (e.g. network flapping)" + echo "Recommended Resolutions:" + echo " . If this is a new cluster, or hosts have been upgraded/reboot, this is likely expected" + echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." + echo " . Review hardware and network stability, then contact customer success" fi if [[ $((${CURRENT_TIME_EPOCH}-${MOST_RECENT_PROCESS_STARTTIME_EPOCH})) -lt 3600 ]]; then RETURN_CODE="254" echo "Weka processes have been restarted within the last hour, or have never started. This may not be a problem on a new cluster" echo "but could be indicative of problems (e.g. network flapping" + echo "Recommended Resolutions:" + echo " . 
If this is a new cluster, or hosts have been upgraded/reboot, this is likely expected" + echo " . Otherwise the most likely cause is network problems, such as link flapping or congestion." + echo " . Review hardware and network stability, then contact customer success" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/530_high_drive_read_ssd_ratio.sh b/install/scripts.d/ta/530_high_drive_read_ssd_ratio.sh index 1416142..4310f22 100755 --- a/install/scripts.d/ta/530_high_drive_read_ssd_ratio.sh +++ b/install/scripts.d/ta/530_high_drive_read_ssd_ratio.sh @@ -34,6 +34,9 @@ if [[ ${HIGHER_THAN_EXPECTED} == "YES" ]]; then echo "The ratio of NVMe read requests vs DRIVE node read operations is higher than expected over the last ${TIME_TO_EXAMINE}" echo "This could indicate a number of things, such as splitting of read requests or perhaps read amplification" echo "Review ${JIRA_REFERENCE} for details" + echo "Recommended Resolutions:" + echo " . This may be expected behavior for your workload" + echo " . The data may be read using much larger blocksizes than those in which it was written, and matching those may help" fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/550_iptables_nats_local_traffic.sh b/install/scripts.d/ta/550_iptables_nats_local_traffic.sh index 4ca97ca..b228339 100755 --- a/install/scripts.d/ta/550_iptables_nats_local_traffic.sh +++ b/install/scripts.d/ta/550_iptables_nats_local_traffic.sh @@ -21,6 +21,7 @@ for IP_ADDRESS in $(hostname --all-ip-addresses) ; do if [[ $? -eq 0 ]] ; then echo "Warning: it is possible that traffic to or from local IP address ${IP_ADDRESS} will be subject to NAT" echo "This can cause intra-WEKA communication errors" + echo "Recommended Resolution: Do not NAT WEKA traffic" RETURN_CODE="254" fi done @@ -29,6 +30,7 @@ for IP_ROUTE in $(ip -4 --json route list | python3 -c 'import sys, json, colle if [[ $? 
-eq 0 ]] ; then echo "Warning: it is possible that traffic to or from subnet ${IP_ROUTE} will be subject to NAT" echo "This can cause intra-WEKA communication errors" + echo "Recommended Resolution: Do not NAT WEKA traffic" RETURN_CODE="254" fi done diff --git a/install/scripts.d/ta/560_check_for_swap.sh b/install/scripts.d/ta/560_check_for_swap.sh index e3de53d..123f4a4 100755 --- a/install/scripts.d/ta/560_check_for_swap.sh +++ b/install/scripts.d/ta/560_check_for_swap.sh @@ -14,6 +14,7 @@ SWAPTOTAL=$(grep SwapTotal /proc/meminfo | awk '{print $2}') if [[ ${SWAPTOTAL} -ne "0" ]] ; then echo "This host has swap configured - this is unlikely to be" echo "helpful in a large memory system" + echo "Recommended Resolution: if the host has enough RAM, disable swap with swapoff then disable swap at boot time (likely in /etc/fstab)" RETURN_CODE="254" fi diff --git a/install/scripts.d/ta/570_does_weka_use_swap.sh b/install/scripts.d/ta/570_does_weka_use_swap.sh index 5543182..2d83ab5 100755 --- a/install/scripts.d/ta/570_does_weka_use_swap.sh +++ b/install/scripts.d/ta/570_does_weka_use_swap.sh @@ -17,6 +17,10 @@ for WEKAPID in $(ps -eo pid,comm | grep weka_init | awk '{print $1}') ; do if [[ ${NUM_PROCS_USING_SWAP} -gt "0" ]] ; then echo "There are Weka processes using swap - this is likely to be" echo "detrimental to performance" + echo "Recommended Resolutions:" + echo " . Add more RAM if the host is truly constrained" + echo " . Review if the host has not correctly released RAM" + echo " . 
Reduce the amount of RAM allocated to WEKA (a last resort)" RETURN_CODE="254" fi done diff --git a/install/scripts.d/ta/580_weka_version_available_everywhere.sh b/install/scripts.d/ta/580_weka_version_available_everywhere.sh index aca379f..fc6313d 100755 --- a/install/scripts.d/ta/580_weka_version_available_everywhere.sh +++ b/install/scripts.d/ta/580_weka_version_available_everywhere.sh @@ -27,6 +27,8 @@ CURRENT_AGENT_VERSION=$(weka local status | awk 'NR==1{print $5}' | tr -d ')') if [[ ${WEKA_CLUSTER_VERSION} != ${CURRENT_AGENT_VERSION} ]] ; then echo "The currently running cluster version ${WEKA_CLUSTER_VERSION} does not match the" echo "default installed local agent version ${CURRENT_AGENT_VERSION}" + echo "Recommended Resolution: update this host to the cluster version, either by" + echo " unmounting and re-mounting filesystems or using the weka local upgrade utility" RETURN_CODE="254" fi diff --git a/install/scripts.d/ta/590_single_dns_entry.sh b/install/scripts.d/ta/590_single_dns_entry.sh index ecb4c61..a0d6303 100755 --- a/install/scripts.d/ta/590_single_dns_entry.sh +++ b/install/scripts.d/ta/590_single_dns_entry.sh @@ -43,6 +43,7 @@ fi if [[ ${NUMBER_OF_A_RECORDS} != "1" ]] ; then echo "There are ${NUMBER_OF_A_RECORDS} A records in DNS for ${HOSTNAME}" echo "This is very likely to cause problems with (at least) SMB-W clustering" + echo "Recommended Resolution: ensure DNS holds exactly one record of type A for ${HOSTNAME}, pointing to its IPv4 address (add a missing record or remove duplicates)" RETURN_CODE=254 else echo "There is exactly one A record in DNS for ${HOSTNAME}" diff --git a/install/scripts.d/ta/610_nfs_aliases_sbr.sh b/install/scripts.d/ta/610_nfs_aliases_sbr.sh index a834366..6a36076 100644 --- a/install/scripts.d/ta/610_nfs_aliases_sbr.sh +++ b/install/scripts.d/ta/610_nfs_aliases_sbr.sh @@ -71,6 +71,8 @@ main() { done < <(ip -4 rule | awk '{print $3}' | grep -v "all") if [[ $found_rule -eq 0 ]]; then echo "WARNING: No ip rule for address $NFS_IP! 
It is possible source-based routing should be configured." + echo "Recommended Resolution: configure source-based routing. Examples are mentioned in the WEKA docs:" + echo "https://docs.weka.io/planning-and-installation/bare-metal/setting-up-the-hosts#configure-the-ha-networking" RETURN_CODE=254 fi done < <(weka nfs interface-group assignment --no-header | awk '$3 == '$weka_host_id'' | awk '{print $1}') diff --git a/install/scripts.d/ta/620_same_mtu_across_nics.sh b/install/scripts.d/ta/620_same_mtu_across_nics.sh index e26a891..d08c1db 100644 --- a/install/scripts.d/ta/620_same_mtu_across_nics.sh +++ b/install/scripts.d/ta/620_same_mtu_across_nics.sh @@ -33,6 +33,10 @@ for CONTAINER in $(weka local ps --no-header | awk '{print $1}' | grep -vw -e en echo "has an MTU of ${MTU}, which is less than the MTU ${SMALLEST_MTU_REQUIRED} seen elsewhere in this host" echo "This can lead to cluster communication problems" echo "Please see ${JIRA_REFERENCE} for more information" + echo "Recommended Resolution: Increase the MTUs of all NICs in the cluster to at least ${SMALLEST_MTU_REQUIRED}" + echo "Review your OS documentation for how to set this permanently, but NetworkManager-based OSes will use" + echo "something like \"nmcli connection modify eno1 802-3-ethernet.mtu ${SMALLEST_MTU_REQUIRED}\" and then" + echo "\"nmcli connection apply eno1\", but connection names will vary" RETURN_CODE=254 fi done diff --git a/install/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh b/install/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh index f00e76c..f05ba7c 100644 --- a/install/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh +++ b/install/scripts.d/ta/630_opt_weka_exists_but_not_mounted.sh @@ -29,6 +29,11 @@ main() { echo echo "This means that changes made to the live system as it is now" echo "are unlikely to be present on the system post-reboot" + echo "Recommended Resolution:" + echo " . Do NOT reboot the host" + echo " . 
You need to verify that the on-boot configuration for /opt/weka" + echo " matches and uses the currently used layout. This may involve" + echo " editing filesystem layouts in /etc/fstab or systemd" RETURN_CODE=254 else echo "No immediate directory/mount overlaps found" diff --git a/install/scripts.d/ta/640_opt_weka_is_not_symlink.sh b/install/scripts.d/ta/640_opt_weka_is_not_symlink.sh index 5605ce3..9f50af6 100644 --- a/install/scripts.d/ta/640_opt_weka_is_not_symlink.sh +++ b/install/scripts.d/ta/640_opt_weka_is_not_symlink.sh @@ -13,6 +13,9 @@ main() { if [[ -L /opt/weka ]] ; then echo "/opt/weka is a symlink. This is not supported and" echo "is very unlikely to work due to chroot-style container behaviour" + echo "Recommended Resolution: Do not install Weka in a symlink" + echo "Resolving this can involve a rolling deactivation and re-installation" + echo "of Weka, depending on how and why this was done" RETURN_CODE=254 else echo "/opt/weka is not a symlink. This is ok" diff --git a/install/scripts.d/ta/650_firewall_check_quick.sh b/install/scripts.d/ta/650_firewall_check_quick.sh index 0cbd15a..fbe30b4 100644 --- a/install/scripts.d/ta/650_firewall_check_quick.sh +++ b/install/scripts.d/ta/650_firewall_check_quick.sh @@ -9,62 +9,67 @@ WTA_REFERENCE="" KB_REFERENCE="" RETURN_CODE=0 -# Last modified: 2024-09-23 +# Last modified: 2024-11-05 # Assumption / limitations -# Queries weka local status for valid list of backend IPs +# Must permit ICMP (ping) # Only performs TCP pings against the management ports (base_port + 0) -# Assumes weka local status output structure is static +declare -A BACKEND_MGMT_PORTS declare -A BACKEND_IPS -curr_ip="" -curr_ips=() +# Check if we can run weka commands +# Capture the exit status once: every [[ ]] test overwrites $?, so it cannot be re-read per branch +weka status &> /dev/null +WEKA_STATUS_RC=$? +if [[ ${WEKA_STATUS_RC} -eq 127 ]]; then + echo "WEKA not found" + exit 254 +elif [[ ${WEKA_STATUS_RC} -eq 41 ]]; then + echo "Unable to login into Weka cluster." + exit 254 +elif [[ ${WEKA_STATUS_RC} -ne 0 ]]; then + echo "ERROR: Not able to run weka commands" 
+ exit 254 +fi -# Determine what "base" ports each backend is using -while read line; do - if [[ $line =~ ^"ip: "(.*) ]]; then - curr_ip=${BASH_REMATCH[1]} - curr_ips+=($curr_ip) - elif [[ $line =~ ^"port: "(.*) ]]; then - port=${BASH_REMATCH[1]} - if [[ -z ${BACKEND_IPS[$curr_ip]+set} ]]; then - BACKEND_IPS[$curr_ip]="$port:" - elif [[ ! ${BACKEND_IPS[$curr_ip]} =~ "$port:" ]]; then # Only add if not there - BACKEND_IPS[$ip]="${BACKEND_IPS[$ip]}$port:" - fi - elif [[ $line =~ ^"base_port: "(.*) ]]; then - base_port=${BASH_REMATCH[1]} - for ip in ${curr_ips[@]}; do - if [[ ! ${BACKEND_IPS[$ip]} =~ "$base_port:" ]]; then # Only add if not there - BACKEND_IPS[$ip]="${BACKEND_IPS[$ip]}$base_port:" - fi - done - curr_ips=() - fi -done < <(weka local status -J 2>/dev/null | grep -w -e "ip\":" -e "port\":" -e "base_port\":" | tr -d '",') +while read CONTAINER_ID; do + curr_ips=() + BACKEND_MGMT_PORTS[${CONTAINER_ID}]=$(weka cluster container ${CONTAINER_ID} -J | grep mgmt_port | grep -o "[0-9]\+") + + for IP in $(weka cluster container ${CONTAINER_ID} -o ips --no-header | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+"); do + curr_ips="${curr_ips} ${IP}" + done + BACKEND_IPS[${CONTAINER_ID}]=${curr_ips} +done < <(weka cluster container -b -o id --no-header) # Perform the port checks -for ip in ${!BACKEND_IPS[@]}; do - # If it does not respond to a ping, within 250ms, - # assume the IP is not valid / reachable. - if (ping -c 1 -q -W 250 $ip &>/dev/null); then - IFS=':' read -r -a ports <<< "${BACKEND_IPS[$ip]}" - for port in ${ports[@]}; do +for CONTAINER_ID in ${!BACKEND_IPS[@]}; do + curr_ips=${BACKEND_IPS[$CONTAINER_ID]} + port=${BACKEND_MGMT_PORTS[$CONTAINER_ID]} + + for ip in ${curr_ips[@]}; do + # If it does not respond to a ping, within 250ms, + # assume the IP is not valid / reachable. + if (ping -c 1 -q -W 250 $ip &>/dev/null); then if (! 
echo -n 2>/dev/null < /dev/tcp/$ip/$port); then echo "WARN: Unable to connect to $ip tcp/$port" + echo "Recommended Resolution: There is likely something blocking network communication between" + echo "this host and ${ip} tcp/${port}. Please review network connectivity and/or firewalls" + echo "In particular DDOS-style protection on switches may prevent communication" RETURN_CODE=254 fi - done - else - echo "WARN: Unable to ping $ip" - RETURN_CODE=254 - fi + + else + echo "WARN: Unable to ping $ip" + RETURN_CODE=254 + fi + done done if [[ ${RETURN_CODE} -eq 0 ]]; then echo "No backend management ports blocked." fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/660_hugepages_check.sh b/install/scripts.d/ta/660_hugepages_check.sh index 1e49e48..96443d6 100644 --- a/install/scripts.d/ta/660_hugepages_check.sh +++ b/install/scripts.d/ta/660_hugepages_check.sh @@ -28,6 +28,8 @@ if [[ -n $WEKA_HUGE_1G ]]; then if [[ $DIFF_1G != 0 ]]; then RETURN_CODE=254 echo "Discrepancy of $DIFF_1G 1GiB hugepage(s) between Weka and OS." + echo "Recommended Resolution: Review if other applications (such as hypervisors) are" + echo "using hugepages. If they are, this may be expected." fi fi @@ -37,6 +39,8 @@ if [[ -n $WEKA_HUGE_2M ]]; then if [[ $DIFF_2M != 0 ]]; then RETURN_CODE=254 echo "Discrepancy of $DIFF_2M 2MiB hugepage(s) between Weka and OS." + echo "Recommended Resolution: Review if other applications (such as hypervisors) are" + echo "using hugepages. If they are, this may be expected." fi fi @@ -45,4 +49,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "No hugepages allocation discrepancy." 
fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/670_crowdstrike_check.sh b/install/scripts.d/ta/670_crowdstrike_check.sh index dcab6ab..31497f1 100644 --- a/install/scripts.d/ta/670_crowdstrike_check.sh +++ b/install/scripts.d/ta/670_crowdstrike_check.sh @@ -14,9 +14,13 @@ RETURN_CODE=0 if systemctl status falcon-sensor &> /dev/null; then echo "Warning: CrowdStrike Falcon Sensor is running" + echo "Recommended Resolution: we do not recommend using this software in conjunction with WEKA as" + echo "it has been shown to cause problems unloading kernel modules" exit 254 elif lsmod | grep -q -m 1 falcon_lsm; then echo "Warning: Crowdstrike Falcon kernel module loaded" + echo "Recommended Resolution: we do not recommend using this software in conjunction with WEKA as" + echo "it has been shown to cause problems unloading kernel modules" exit 254 fi echo "CrowdStrike Falcon Sensor is not running" diff --git a/install/scripts.d/ta/670_nm_ignore_carrier.sh b/install/scripts.d/ta/670_nm_ignore_carrier.sh index 8431e97..ca831f2 100644 --- a/install/scripts.d/ta/670_nm_ignore_carrier.sh +++ b/install/scripts.d/ta/670_nm_ignore_carrier.sh @@ -20,6 +20,9 @@ if nmcli -v &> /dev/null; then elif [[ "$IGNORE_CARRIER" != "*" ]]; then RETURN_CODE=254 echo "NetworkManager ignore-carrier is set to ${IGNORE_CARRIER}, but recommended value is ignore-carrier=*" + echo "Recommended Resolution: set ignore-carrier=* in NetworkManager, perhaps with the following commands" + echo " echo -e '[main]\\nignore-carrier=*' > /etc/NetworkManager/conf.d/99-carrier.conf " + echo " systemctl restart NetworkManager " else echo "NetworkManager ignore-carrier=* exists." fi @@ -30,4 +33,4 @@ else echo "NetworkManager not in use." 
fi -exit ${RETURN_CODE} \ No newline at end of file +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/680_redundant_weka_overrides.sh b/install/scripts.d/ta/680_redundant_weka_overrides.sh index 942303f..3c31865 100644 --- a/install/scripts.d/ta/680_redundant_weka_overrides.sh +++ b/install/scripts.d/ta/680_redundant_weka_overrides.sh @@ -51,6 +51,8 @@ while read CURRENT_OVERRIDE; do REDUNDANT_FROM_VERSION=${REDUNDANT_OVERRIDE_LIST[${CURRENT_OVERRIDE}]} if verlte ${REDUNDANT_FROM_VERSION} ${CURRENT_WEKA_VERSION} ; then echo "Override ${CURRENT_OVERRIDE} is no longer necessary as of v${REDUNDANT_FROM_VERSION}" + echo "Recommended Resolution: Contact customer success and query if this override can" + echo "be disabled and subsequently removed" RETURN_CODE=254 fi done < <(weka debug override list --output key --no-header) diff --git a/install/scripts.d/ta/690_auto_core_in_mcb.sh b/install/scripts.d/ta/690_auto_core_in_mcb.sh index 049ed8f..3636f17 100644 --- a/install/scripts.d/ta/690_auto_core_in_mcb.sh +++ b/install/scripts.d/ta/690_auto_core_in_mcb.sh @@ -14,6 +14,14 @@ for WEKA_CONTAINER in $(weka local ps --output name --no-header | grep -E '(driv MATCHES=$(weka local resources -C ${WEKA_CONTAINER} | grep -cE '^(DRIVES|COMPUTE|FRONTEND) *[0-9].*auto') if [[ ${MATCHES} -ne 0 ]] ; then echo "Host ${HOSTNAME} has auto-core allocation in MCB container ${WEKA_CONTAINER}" + echo "Recommended Resolution: reconfigure the local resources to use a fixed CPU core, such as" + if [[ ${WEKA_CONTAINER} =~ "drive" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-drives-cores --core-ids X,Y,Z" + elif [[ ${WEKA_CONTAINER} =~ "compute" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-compute-cores --core-ids X,Y,Z" + elif [[ ${WEKA_CONTAINER} =~ "frontend" ]] ; then + echo "weka local resources cores --container ${WEKA_CONTAINER} --only-frontend-cores --core-ids X,Y,Z" + fi exit 254 fi done diff --git 
a/install/scripts.d/ta/700_wekapp351707.sh b/install/scripts.d/ta/700_wekapp351707.sh index fb656cf..7cc8604 100644 --- a/install/scripts.d/ta/700_wekapp351707.sh +++ b/install/scripts.d/ta/700_wekapp351707.sh @@ -39,6 +39,7 @@ if [[ $WEKA_VERSION = "4.2.7.64" || $WEKA_VERSION = "4.2.8.66" ]]; then echo "SSD metadata exceeds more than half of available SSD space on one or more filesystems." echo "Possibly vulnerable to WEKAPP-351707." echo "Consider adding the fs_backpressure_skip_ssdwritecache_estimation_all override." + echo "Recommended resolution: upgrade to a version beyond 4.2.9.x" fi fi done < <(weka fs -R --no-header -o availableSSD,usedSSDM,stores | sed -e 's/B//g' | awk '{print $1, $2, $3}') diff --git a/install/scripts.d/ta/710_no_spaces_in_cluster_name.sh b/install/scripts.d/ta/710_no_spaces_in_cluster_name.sh index 349e013..2a209f8 100644 --- a/install/scripts.d/ta/710_no_spaces_in_cluster_name.sh +++ b/install/scripts.d/ta/710_no_spaces_in_cluster_name.sh @@ -27,6 +27,9 @@ WEKA_CLUSTER_NAME=$(weka status | grep cluster: | sed -e 's/^ *cluster: *//' -e if [[ ${WEKA_CLUSTER_NAME} = *" "* ]]; then echo "Weka cluster name contains spaces" echo "This will prevent an S3 cluster from starting - see KB ${KB_REFERENCE}" + NEW_RECOMMENDED_NAME=$(echo ${WEKA_CLUSTER_NAME} | sed 's/ /_/g') + echo "Recommended resolution: update the cluster name, e.g. 
using:" + echo " weka cluster update --cluster-name ${NEW_RECOMMENDED_NAME}" RETURN_CODE=254 else echo "Weka cluster name does not contain spaces" diff --git a/install/scripts.d/ta/720_low_compute_ram_to_ssd.sh b/install/scripts.d/ta/720_low_compute_ram_to_ssd.sh index f14a276..4ffa80f 100644 --- a/install/scripts.d/ta/720_low_compute_ram_to_ssd.sh +++ b/install/scripts.d/ta/720_low_compute_ram_to_ssd.sh @@ -30,6 +30,11 @@ RAM_TO_SSD_RATIO=$(echo "${WEKA_SSD_CAPACITY}/${WEKA_COMPUTE_RAM}" | bc) if [[ ${RAM_TO_SSD_RATIO} -gt 4000 ]]; then echo "Warning: there is more than 4000 times the RAM capacity in total NVME capacity" echo "This may lead to Weka bucket startup issues. Refer to ${JIRA_REFERENCE}" + echo "Recommended Resolution: add more memory to cluster - options include:" + echo " . Increasing the amount of memory allocated to COMPUTE processes if there's spare" + echo " . Increasing the amount RAM installed, then doing the above" + echo " . Scaling out by adding more hosts" + echo " . 
Reducing the size of the NVME by removing drives or tiering to Object Store" RETURN_CODE=254 else echo "RAM to SSD ratio is acceptable" diff --git a/install/scripts.d/ta/730_large_drives.sh b/install/scripts.d/ta/730_large_drives.sh index e054d73..ef5f4e5 100755 --- a/install/scripts.d/ta/730_large_drives.sh +++ b/install/scripts.d/ta/730_large_drives.sh @@ -43,7 +43,7 @@ if verlt ${WEKA_VERSION} "4.1.2" && [[ ${LARGEST_SSD} -gt ${LARGEST_SUPPORTED_SS RETURN_CODE=254 echo "Weka only supports SSDs larger than ${LARGEST_SUPPORTED_SSD} in versions after 4.1.2" echo "Refer to ${KB_REFERENCE} or ${JIRA_REFERENCE} for more information" - + echo "Recommended Resolution: upgrade to our latest LTS release" else echo "No SSDs are beyond supported capacities" fi diff --git a/install/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh b/install/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh index 3e43a31..8182fa3 100644 --- a/install/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh +++ b/install/scripts.d/ta/740_ensure_cgroups_v1_with_protocols.sh @@ -31,6 +31,9 @@ else for CONTAINER in $(weka local ps --no-header | awk '{print $1}' | grep -w -e ganesha -e smbw -e s3) ; do RETURN_CODE=254 echo "Protocol container ${CONTAINER} is not yet compatible with cgroup mode ${CURRENT_CGROUP_MODE}" + echo "Recommended Resolution: reboot the host with cgroup v1 enabled, likely by adding" + echo "\"systemd.unified_cgroup_hierarchy=false\" to e.g. /etc/default/grub's DEFAULT line and" + echo "running \"update-grub\" (OS-dependent)" done fi diff --git a/install/scripts.d/ta/740_mlx_settings.sh b/install/scripts.d/ta/740_mlx_settings.sh index 36c9706..d9c416d 100644 --- a/install/scripts.d/ta/740_mlx_settings.sh +++ b/install/scripts.d/ta/740_mlx_settings.sh @@ -65,6 +65,10 @@ fi if [[ $RETURN_CODE -eq 0 ]]; then echo "Mellanox NIC settings correctly set." +else + echo "Mellanox NIC settings are not as recommended. 
Recommended Resolution:" + echo 'for dev in $(ls /sys/class/infiniband/); do sudo mlxconfig -y -d ${dev} set ADVANCED_PCI_SETTINGS=1 PCI_WR_ORDERING=1 ; done' + echo "Followed by rebooting this host, one at a time" fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/install/scripts.d/ta/755_wekapp424920_smbw_mask.sh b/install/scripts.d/ta/755_wekapp424920_smbw_mask.sh index 2d3ef71..233bd3e 100644 --- a/install/scripts.d/ta/755_wekapp424920_smbw_mask.sh +++ b/install/scripts.d/ta/755_wekapp424920_smbw_mask.sh @@ -34,11 +34,15 @@ if weka smb cluster | awk '/Type:/ && /smbw/' &> /dev/null; then if [[ $NUM_SHARES -ne $NUM_FILE_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_FILE_MASKS shares with force_create_mode" + echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." + echo " WARNING: this will likely be service-affecting" RETURN_CODE=254 fi if [[ $NUM_SHARES -ne $NUM_DIR_MASKS ]]; then echo "WARN: there are $NUM_SHARES smbw shares, but only $NUM_DIR_MASKS shares with force_directory_mode" + echo "Recommended Resolution: for each share, delete and re-create it to ensure this mode is set." 
+ echo " WARNING: this will likely be service-affecting" RETURN_CODE=254 fi else @@ -55,4 +59,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "Not vulnerable to WEKAPP-424920 - smbw shares properly defined" fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/install/scripts.d/ta/765_process_network_mode.sh b/install/scripts.d/ta/765_process_network_mode.sh index f269a65..43d5d54 100644 --- a/install/scripts.d/ta/765_process_network_mode.sh +++ b/install/scripts.d/ta/765_process_network_mode.sh @@ -30,6 +30,7 @@ for ROLE in COMPUTE DRIVES; do if [[ $(weka cluster process -F role=${ROLE} -o netmode --no-header | sort | uniq | wc -l) -gt 1 ]]; then RETURN_CODE=254 echo "WARNING: $ROLE process network modes are inconsistent" + echo "Recommended Resolution: contact Customer Success to ensure that each container is defined correctly" fi done @@ -38,4 +39,4 @@ if [[ $RETURN_CODE -eq 0 ]]; then echo "Backend process network modes are consistent." fi -exit $RETURN_CODE \ No newline at end of file +exit $RETURN_CODE diff --git a/install/scripts.d/ta/775_dup_arp_check.sh b/install/scripts.d/ta/775_dup_arp_check.sh index 71d0a0f..4bf2763 100644 --- a/install/scripts.d/ta/775_dup_arp_check.sh +++ b/install/scripts.d/ta/775_dup_arp_check.sh @@ -31,6 +31,7 @@ fi for MGMT_IP in $(weka cluster container net -o ips --no-header | tr ',' '\n' | tr -d " " | sort -u); do if [[ $(ip -br neigh | grep ${MGMT_IP} | awk '{print $3}' | sort -u | wc -l) -gt 1 ]]; then echo "WARN: Duplicate arp entry found for IP ${MGMT_IP}" + echo "Recommended Resolution: check for IP clashes, and that there is a 1:1 mapping for IP:MACs" RETURN_CODE=254 fi done diff --git a/install/scripts.d/ta/785_asymmetric_mtu.sh b/install/scripts.d/ta/785_asymmetric_mtu.sh index b783a1e..6792d51 100644 --- a/install/scripts.d/ta/785_asymmetric_mtu.sh +++ b/install/scripts.d/ta/785_asymmetric_mtu.sh @@ -15,6 +15,10 @@ for INDIVIDUAL_DRIVE_PROCESS in $(weka cluster process --backends --filter role= 
if [[ $(weka debug net peers --no-header ${INDIVIDUAL_DRIVE_PROCESS} --output inMTU,outMTU | awk '{if($1 != $2) {print "yes"}}') == "yes" ]]; then host=$(weka cluster process ${INDIVIDUAL_DRIVE_PROCESS} --no-header -o hostname) echo "WARN: Asymmetric MTU detected for at least one peer of ${host}, process id ${INDIVIDUAL_DRIVE_PROCESS}" + echo "Recommended Resolution: The usual cause for this is asymmetric routing, with different MTUs configured" + echo "along the two different paths. Run a tracepath/traceroute from each end of the backend<->client" + echo "connection, and determine if routes take different paths. It's likely that different paths will have" + echo "different pMTUs, and every intervening link on the path with the smaller MTU should be checked" RETURN_CODE=254 fi done diff --git a/install/scripts.d/ta/790_raft_agents.sh b/install/scripts.d/ta/790_raft_agents.sh index 2a1b5dd..fce15ee 100755 --- a/install/scripts.d/ta/790_raft_agents.sh +++ b/install/scripts.d/ta/790_raft_agents.sh @@ -30,6 +30,7 @@ WEKA_MAX_RAFT_AGENTS=$((${WEKA_COMPUTE_PROCESS_COUNT}*180)) if [[ ${WEKA_RAFT_AGENTS} -gt ${WEKA_MAX_RAFT_AGENTS} ]] ; then echo "The maximum number of raft agents recommended per compute node is 180. This cluster requires ${WEKA_RAFT_AGENTS} in total" + echo "Recommended resolution: scale out your cluster by adding more compute processes or perhaps backend WEKA servers" RETURN_CODE=254 fi if [[ ${RETURN_CODE} -eq 0 ]]; then diff --git a/install/scripts.d/ta/795_netmask_mismatch.sh b/install/scripts.d/ta/795_netmask_mismatch.sh index ebd40ba..3ce0dd4 100644 --- a/install/scripts.d/ta/795_netmask_mismatch.sh +++ b/install/scripts.d/ta/795_netmask_mismatch.sh @@ -32,6 +32,11 @@ done if [[ ${RETURN_CODE} -eq 0 ]]; then echo "All Weka containers have consistent netmasks" +else + echo "Recommended Resolution: determine which of these netmasks is correct, and rectify the one with" + echo "the wrong configuration. 
If Weka needs re-configuring, this will be done with commands like" + echo " weka local resources --container net remove " + echo " weka local resources --container net add --netmask " fi exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/805_lacp_hash_check.sh b/install/scripts.d/ta/805_lacp_hash_check.sh new file mode 100644 index 0000000..3f83252 --- /dev/null +++ b/install/scripts.d/ta/805_lacp_hash_check.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Hashing on DPDK LACP links is only supported on CX6-DX and higher" +SCRIPT_TYPE="parallel" +JIRA_REFERENCE="WEKAPP-429344" +WTA_REFERENCE="" +RETURN_CODE=0 + +declare -A NET_MODEL +declare -A PCI_BUSES + +BONDED_INTERFACE="" + +# The below is currently unverified: +# In order to enable this feature, set this mode for both bonded devices through +# the below sysfs before the device is in switchdev mode: +# echo "hash" > /sys/class/net/enp8s0f0/compat/devlink/lag_port_select_mode + +# This feature requires to set LAG_RESOURCE_ALLOCATION to 1 with mlxconfig + +# References: +# https://download.lenovo.com/servers/mig/2023/06/12/57746/mlnx-lnvgy_dd_nic_cx.ib-5.9-0.5.6.0-0_rhel8_x86-64.pdf + + +# dmesg may indicate 'devlink op lag_port_select_mode doesn't support hw lag' +# on unsupported models. + +if ! lshw -version &> /dev/null; then + echo "Unable to locate lshw." + exit 0 +fi + +# weka local resources net -C drives0 --stable +# NET DEVICE IDENTIFIER DEFAULT GATEWAY IPS NETMASK NETWORK LABEL +# bond0 0000:08:00.0 + +# lshw -C network -businfo +# Bus info Device Class Description +# ============================================================= +# pci@0000:65:00.0 ens9f0np0 network MT2892 Family [ConnectX-6 Dx] +# pci@0000:65:00.1 ens9f1np1 network MT2892 Family [ConnectX-6 Dx] + + +# Is the cluster using a bonded NIC? 
+while read CONTAINER; do + while read NET_ENTRY; do + if [[ ${NET_ENTRY} =~ "name:"(.*) ]]; then + NET_NAME=${BASH_REMATCH[1]} + if [[ -f /proc/net/bonding/${NET_NAME} ]]; then + BONDED_INTERFACE=${NET_NAME} + fi + fi + done < <(weka local resources -C ${CONTAINER} net --stable -J | grep -w -e name | tr -d \"\,[:blank:]) +done < <(weka local ps --output name --no-header | grep -vw -e envoy -e ganesha -e samba -e smbw -e s3) + + +if [[ -n ${BONDED_INTERFACE} ]]; then + if [[ $(cat /sys/class/net/${BONDED_INTERFACE}/bonding/xmit_hash_policy) =~ "layer2" ]]; then + echo "WARN: xmit hash policy for ${BONDED_INTERFACE} set to layer2." + fi + + # Look for ConnectX adapters by iterating over each container + while read CONTAINER; do + while read PCI; do + while read LINE; do + if [[ $LINE =~ "pci@"([[:digit:][:punct:]]+)[[:blank:]]+([[:alnum:]]+)[[:blank:]]+"network"[[:blank:]]+(.*) ]]; then + NET=${BASH_REMATCH[2]} + MODEL=${BASH_REMATCH[3]} + PCI_BUSES[$PCI]=${NET} + NET_MODEL[$NET]=${MODEL} + fi + done < <(lshw -C network -businfo -quiet | awk '/'"$PCI"'/ && /ConnectX/{print $0}') + done < <(weka local resources net -C "$CONTAINER" --stable | awk 'NR>1 {print $2}' | sed -e 's/\.[0-9]//g') + done < <(weka local ps --output name --no-header | grep -vw -e envoy -e ganesha -e samba -e smbw -e s3) +else + echo "INFO: NIC bonding not enabled." + exit 0 +fi + + +if [[ ${#PCI_BUSES[@]} -eq 0 ]]; then + echo "INFO: Unable to locate Mellanox NICs." + exit 0 +elif [[ ${#PCI_BUSES[@]} -gt 1 ]]; then + echo "WARN: Potentially bonding across NICs, which is not supported." + RETURN_CODE=254 +else + for NET in "${!NET_MODEL[@]}"; do + if [[ ! ((${NET_MODEL[${NET}]} =~ "ConnectX-6 Dx") || (${NET_MODEL[${NET}]} =~ "ConnectX-7")) ]]; then + echo "WARN: The ${NET} NIC (${NET_MODEL[${NET}]}) may not support hashing on bonded links." + RETURN_CODE=254 + fi + done +fi + + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "Bonding properly configured." 
+else + echo "Recommended Resolution: Determine NIC compatibility with the bonding mode selected:" + echo "https://docs.weka.io/planning-and-installation/prerequisites-and-compatibility#networking-ethernet" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/810_use_only_readcache_for_protocols.sh b/install/scripts.d/ta/810_use_only_readcache_for_protocols.sh new file mode 100644 index 0000000..577ee8e --- /dev/null +++ b/install/scripts.d/ta/810_use_only_readcache_for_protocols.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +#set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Ensure any protocols are using only readcache mounts" +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="parallel" +JIRA_REFERENCE="WEKAPP-444847" + +RETURN_CODE=0 + +for WEKA_CONTAINER in $(sudo weka local ps --output name --no-header | grep -w -e ganesha -e samba -e smbw -e s3); do + MOUNTS_USING_WRITECACHE=$(sudo weka local exec --container ${WEKA_CONTAINER} mount -t wekafs | grep -c writecache) + if [[ ${MOUNTS_USING_WRITECACHE} != "0" ]]; then + echo "WARN: container ${WEKA_CONTAINER} - used for protocols - is using writecache on host ${HOSTNAME}" + echo "Refer to ${JIRA_REFERENCE} for more details" + if [[ ${WEKA_CONTAINER} =~ "s3" ]]; then + echo "Recommended Resolution: for s3, use the following (brief service interruption):" + echo " weka s3 cluster update --mount-options readcache -f" + elif [[ ${WEKA_CONTAINER} =~ "smb" ]]; then + echo "Recommended Resolution: for smb, for each share, delete it and re-add it (service interruption)" + elif [[ ${WEKA_CONTAINER} =~ "ganesha" ]]; then + echo "Recommended Resolution: for NFS, for each share, delete it and re-add it (service interruption)" + fi + + RETURN_CODE=254 + fi +done + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "No protocols are using writecache" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/815_no_spaces_in_fs_name.sh 
b/install/scripts.d/ta/815_no_spaces_in_fs_name.sh new file mode 100644 index 0000000..a95e7f7 --- /dev/null +++ b/install/scripts.d/ta/815_no_spaces_in_fs_name.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +#set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Verify no spaces in any filesystem name" +SCRIPT_TYPE="single" +JIRA_REFERENCE="" +WTA_REFERENCE="" +KB_REFERENCE="" +RETURN_CODE=0 + +# Check if we can run weka commands; capture the status once, as every [[ ]] test overwrites $? +WEKA_STATUS_RC=0 +weka status &> /dev/null || WEKA_STATUS_RC=$? +if [[ ${WEKA_STATUS_RC} -eq 127 ]]; then + echo "Weka not found." + exit 254 +elif [[ ${WEKA_STATUS_RC} -eq 41 ]]; then + echo "Unable to login into Weka cluster." + exit 254 +elif [[ ${WEKA_STATUS_RC} -ne 0 ]]; then + echo "ERROR: Not able to run Weka commands." + exit 254 +fi + +while read -r WEKA_FS_NAME ; do + if [[ ${WEKA_FS_NAME} = *" "* ]]; then + echo "Filesystem \"${WEKA_FS_NAME}\" contains spaces" + echo "This can prevent S3 buckets from being created" + NEW_RECOMMENDED_NAME=$(echo ${WEKA_FS_NAME} | sed 's/ /_/g') + echo "Recommended resolution: update the filesystem name, e.g. 
using:" + echo " weka fs update \"${WEKA_FS_NAME}\" --new-name ${NEW_RECOMMENDED_NAME}" + RETURN_CODE=254 + fi +done < <(weka fs --no-header --output name) + +if [[ $RETURN_CODE -eq 0 ]] ; then + echo "No filesystem found with spaces in the name" + exit 0 +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/825_ha_mgmt_ip.sh b/install/scripts.d/ta/825_ha_mgmt_ip.sh new file mode 100644 index 0000000..63d0102 --- /dev/null +++ b/install/scripts.d/ta/825_ha_mgmt_ip.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Check if each Weka dataplane NIC has a corresponding, valid, management IP" +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="parallel" + +RETURN_CODE=0 + +for WEKA_CONTAINER in $(sudo weka local ps --output name --no-header | grep -e compute -e drives -e frontend); do + NET_NAME="" + NET_COUNT=0 + SINGLE_IP=0 + + while read NET_ENTRY; do + if [[ ${NET_ENTRY} =~ "name:"(.*) ]]; then + NET_NAME=${BASH_REMATCH[1]} + fi + if [[ ${NET_ENTRY} =~ "ips:[]" ]]; then + SINGLE_IP=1 + fi + + if [[ -n ${NET_NAME} ]]; then + NET_COUNT=$((NET_COUNT+1)) + + if [[ $(ip -4 -j -o addr show dev ${NET_NAME} 2>/dev/null | tr -d \"\[:blank:]) =~ "local:"([0-9\.]+) ]]; then + NET_IP=${BASH_REMATCH[1]} + MATCH_FOUND=0 + for IP in $(weka local resources -C ${WEKA_CONTAINER} --stable | grep -e ^"Management IPs" | grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+"); do + if [[ "${IP}" == "${NET_IP}" ]]; then + MATCH_FOUND=1 + break + fi + done + if [[ ${MATCH_FOUND} -eq 0 ]]; then + echo "WARN: Dataplane NIC ${NET_NAME} has IP ${NET_IP}, but this does not appear in the ${WEKA_CONTAINER} container's resources" + RETURN_CODE=254 + fi + fi + fi + done < <(weka local resources -C ${WEKA_CONTAINER} net --stable -J | grep -w -e name -e ips | paste - - | tr -d \"\,[:blank:]) + + MGMT_IP_COUNT=$(weka local resources -C ${WEKA_CONTAINER} --stable | grep -e ^"Management IPs" 
| grep -o "[0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+" | wc -l) + if [[ ${SINGLE_IP} -eq 1 ]] && [[ ${NET_COUNT} -ne ${MGMT_IP_COUNT} ]]; then + echo "WARN: Container ${WEKA_CONTAINER} has ${NET_COUNT} dataplane NICs, but ${MGMT_IP_COUNT} management IPs" + RETURN_CODE=254 + + # Ostensibly this should only occur in cloud environments? + elif [[ ${SINGLE_IP} -eq 0 ]] && [[ ${MGMT_IP_COUNT} -gt 1 ]]; then + echo "WARN: Container ${WEKA_CONTAINER} has ${MGMT_IP_COUNT} management IPs" + RETURN_CODE=254 + fi +done + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "All Weka dataplane NICs have valid management IPs" +else + echo "Recommended Resolution: assign an appropriate set of management ips for each container." + echo "There should be one management IP per dataplane NIC. Management IPs can be set" + echo "by running the following command" + echo " weka local resources --container management-ips " +fi + +exit ${RETURN_CODE} \ No newline at end of file diff --git a/install/scripts.d/ta/835_s2o_unmigrated.sh b/install/scripts.d/ta/835_s2o_unmigrated.sh new file mode 100644 index 0000000..f56372a --- /dev/null +++ b/install/scripts.d/ta/835_s2o_unmigrated.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Check if a filesystem is marked as downloaded in the snapViews." +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="single" + +RETURN_CODE=0 + +# Check if we can run weka commands; "|| WEKA_STATUS_RC=$?" stops set -e aborting before a message is printed +WEKA_STATUS_RC=0 +weka status &> /dev/null || WEKA_STATUS_RC=$? +if [[ ${WEKA_STATUS_RC} -eq 127 ]]; then + echo "WEKA not found" + exit 254 +elif [[ ${WEKA_STATUS_RC} -eq 41 ]]; then + echo "Unable to login into Weka cluster." + exit 254 +elif [[ ${WEKA_STATUS_RC} -ne 0 ]]; then + echo "ERROR: Not able to run weka commands" 
+ exit 254 +fi + +# Sample output +# name:isDownloaded:noguid:00000000-0000-0000-0000-000000000000isDownloaded:noguid:00000000-0000-0000-0000-000000000000 +# name:fs1-snap2isDownloaded:yesguid:192c3fcd-71be-4b94-b7c6-4b39d3c06545isDownloaded:noguid:00000000-0000-0000-0000-000000000000 + +while read SNAPVIEW; do + if [[ ${SNAPVIEW} =~ "name:"(.*)"isDownloaded:yesguid:"([[:alnum:]\-]+)"isDownloaded:" ]]; then + FS_NAME=${BASH_REMATCH[1]} + GUID=${BASH_REMATCH[2]} + + echo "WARN: Filesystem ${FS_NAME} may have been restored via Snap2Obj, but not migrated to its own bucket" + echo " This filesystem was downloaded from the cluster with GUID ${GUID}" + RETURN_CODE=254 + fi +done < <(weka debug config show snapViews | egrep -w '(name|isDownloaded|guid)' | paste - - - - - | tr -d \"\,[:blank:]) + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "No unmigrated Snap2Obj filesystems detected" +else + echo "Recommended Resolution: perform a bucket migration, or a bucket detach, of the affected filesystems" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/845_mem_alloc.sh b/install/scripts.d/ta/845_mem_alloc.sh new file mode 100644 index 0000000..4dd82da --- /dev/null +++ b/install/scripts.d/ta/845_mem_alloc.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Check if Weka is using an excessive amount of total system memory." 
+# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="parallel" + +RETURN_CODE=0 + + +RSS=$(ps -o rsz -C wekanode | awk '{sum+=$1};END {print sum/1024/1024}') +HUGEPAGES=$(ls -la /opt/weka/data/agent/containers/state/*/huge{,1G}/* | awk '{hugepages+=$5}; END {print hugepages/1024/1024/1024}') + +TOTAL_SYS=$(free -g | awk '/Mem/{print $2}') +TOTAL_NON_WEKA=$(awk -v v1=$RSS -v v2=$HUGEPAGES -v v3=$TOTAL_SYS 'BEGIN {print int(v3-(v1+v2))}') + +if [[ ${TOTAL_NON_WEKA} -lt 8 ]]; then + echo "WARN: Less than 8 GiB (${TOTAL_NON_WEKA} GiB) of memory free for non-Weka related processes" + echo "Recommended Resolution: review the system memory requirements at docs.weka.io" + RETURN_CODE=254 +fi + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "More than 8 GiB (${TOTAL_NON_WEKA} GiB) of memory not allocated to Weka" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/850_heartbeat_gt_cluster_lease.sh b/install/scripts.d/ta/850_heartbeat_gt_cluster_lease.sh new file mode 100644 index 0000000..b0e052e --- /dev/null +++ b/install/scripts.d/ta/850_heartbeat_gt_cluster_lease.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="Enforce heartbeat_grace_msec is greater than cluster_lease" +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="single" + +RETURN_CODE=0 + + +HB_GRACE_DEFAULT=$( weka debug override list-keys --filter key=heartbeat_grace_msecs --output defaultValue --no-header | head -n1) +CL_TIMEOUT_DEFAULT=$(weka debug override list-keys --filter key=cluster_lease_timeout_msecs --output defaultValue --no-header | head -n1) +HB_GRACE_MANUAL=$( weka debug override list --filter key=heartbeat_grace_msecs --output value --no-header) +CL_TIMEOUT_MANUAL=$( weka debug override list --filter key=cluster_lease_timeout_msecs --output value --no-header) +HB_GRACE=${HB_GRACE_MANUAL:-${HB_GRACE_DEFAULT}} 
+CL_TIMEOUT=${CL_TIMEOUT_MANUAL:-${CL_TIMEOUT_DEFAULT}} + +# enforce heartbeat_grace_msecs > cluster_lease_timeout_msecs +if [[ ${CL_TIMEOUT} -ge ${HB_GRACE} ]]; then + echo "WARN: cluster_lease_timeout_msecs (${CL_TIMEOUT}) is greater than or equal to heartbeat_grace_msecs (${HB_GRACE})" + echo "This may be because one value has been left at a default, but this configuration" + echo "might prevent cluster status being propagated correctly." + echo "To rectify, ensure that heartbeat_grace_msecs is greater than cluster_lease_timeout_msecs" + RETURN_CODE=254 +fi + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "heartbeat_grace_msecs is greater than cluster_lease_timeout_msecs" +fi + +exit ${RETURN_CODE} diff --git a/install/scripts.d/ta/855_nfsw_fips_sanity.sh b/install/scripts.d/ta/855_nfsw_fips_sanity.sh new file mode 100644 index 0000000..0c1a273 --- /dev/null +++ b/install/scripts.d/ta/855_nfsw_fips_sanity.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +#set -ue # Fail with an error code if there's any sub-command/variable error + +DESCRIPTION="NFSW FIPs sanity" +# script type is single, parallel, sequential, or parallel-compare-backends +SCRIPT_TYPE="parallel" + +RETURN_CODE=0 + +# Check if we can run weka commands; capture the status once so the specific codes are tested before the catch-all +WEKA_STATUS_RC=0 +weka status &> /dev/null || WEKA_STATUS_RC=$? +case ${WEKA_STATUS_RC} in + 0) + ;; + 127) + echo "WEKA not found" + exit 254 + ;; + 41) + echo "Unable to login to Weka cluster." + exit 254 + ;; + *) + echo "ERROR: Not able to run weka commands" 
+ exit 254 + ;; +esac + +CONTAINER_ID=$(weka cluster container -F hostname=$(hostname),container=frontend0 --no-header -o id) +PROCESS_ID=$(weka cluster process -F hostname=$(hostname),container=frontend0,role=frontend --no-header -o id | head -n1) + +# Only perform these checks from those hosts that are part of an NFS interface group +while read FIP_IP FIP_HOST_LABEL FIP_HOST FIP_INTERFACE; do + + ############################# + # LOCAL FIP EXISTENCE CHECK # + ############################# + if ip addr show dev ${FIP_INTERFACE} | grep -q ${FIP_IP}; then + + FIP_MAC=$(ip addr show dev ${FIP_INTERFACE} | grep -oE '([[:xdigit:]]{2}[:]){5}[[:xdigit:]]{2}' | head -n1) + if [[ -n ${FIP_MAC} ]]; then + + ################### + # ARP CACHE CHECK # + ################### + FIP_MAC_ARP=$(ip neigh | grep ${FIP_IP} | awk '{print $5}') + + if [[ -n ${FIP_MAC_ARP} ]]; then + if [[ ${FIP_MAC,,} != ${FIP_MAC_ARP,,} ]]; then + echo "WARN: FIP ${FIP_IP} on interface with MAC ${FIP_MAC}, but also ${FIP_MAC_ARP} in arp cache" + RETURN_CODE=254 + fi + fi + + ################ + # ARPING CHECK # + ################ + if which arping &>/dev/null; then + if strings $(which arping) | grep -q iputils; then + FIP_MAC_ARP=$(arping -I ${FIP_INTERFACE} -c 5 -w 5 ${FIP_IP} | grep -m 1 -oE '([[:xdigit:]]{2}[:]){5}[[:xdigit:]]{2}') + + # The "other" arping? 
https://github.com/ThomasHabets/arping + else + FIP_MAC_ARP=$(arping -i ${FIP_INTERFACE} -c 5 ${FIP_IP} | grep -m 1 -oE '([[:xdigit:]]{2}[:]){5}[[:xdigit:]]{2}') + fi + + if [[ -n ${FIP_MAC_ARP} ]]; then + if [[ ${FIP_MAC,,} != ${FIP_MAC_ARP,,} ]]; then + echo "WARN: FIP ${FIP_IP} on interface with MAC ${FIP_MAC}, but also ${FIP_MAC_ARP} in arp cache" + RETURN_CODE=254 + fi + fi + + else + echo "INFO: arping not installed" + fi + fi + else + echo "WARN: Unable to locate FIP ${FIP_IP} on interface ${FIP_INTERFACE}" + RETURN_CODE=254 + fi +done < <(weka nfs interface-group assignment --no-header -o ip,host,port | awk -v container_id="${CONTAINER_ID}" '$3 == container_id') + + +# Comparing the local state and cluster state tables is too difficult without jq +if jq --version &> /dev/null; then + while read FIP_IP FIP_STALE FIP_STATUS; do + # Only care about entries in "OK" status? + if [[ ${FIP_STATUS} == "OK" ]]; then + # Does the FIP exist on a local interface? + if ip addr show | grep -q ${FIP_IP}; then + # Does the FIP appear in the global table for this host? + if ! weka debug manhole get_aggregated_cluster_status table_names="floatingIps" --node $(weka cluster process -L --no-header -o id) | jq -cr '(.floatingIps|to_entries[]|[.key, (.value|.isStale,.status,.serial.sourceNodeId)])|@tsv' | awk '/OK/ && /NodeId<${PROCESS_ID}>/'; then + echo "WARN: Global state FIP ${FIP_IP} not found for process ${PROCESS_ID}" + RETURN_CODE=254 + fi + else + echo "WARN: Local state FIP ${FIP_IP} not assigned to interface" + RETURN_CODE=254 + fi + fi + done < <(weka debug manhole get_localstate table_names="floatingIps" -n ${PROCESS_ID} | jq -cr '(.floatingIps|to_entries[]|[.key, (.value|.isStale,.status)])|@tsv') +fi + + +if [[ ${RETURN_CODE} -eq 0 ]]; then + echo "NFSW FIPs sanity check passed." +fi + +exit ${RETURN_CODE} diff --git a/wekatester/wekatester b/wekatester/wekatester index 31dfd80..16a3732 100755 Binary files a/wekatester/wekatester and b/wekatester/wekatester differ