diff --git a/README.md b/README.md index d739c7a..6c2c274 100644 --- a/README.md +++ b/README.md @@ -102,9 +102,9 @@ The ``icinga2`` folder contains the command definition and service examples for ``` usage: check_pve.py [-h] -e API_ENDPOINT [--api-port API_PORT] -u API_USER (-p API_PASSWORD | -t API_TOKEN) [-k] -m - {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup} [-n NODE] [--name NAME] - [--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] - [--unit {GB,MB,KB,GiB,MiB,KiB,B}] + {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup} + [-n NODE] [--name NAME] [--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [--ignore-pools NAME] + [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] [--unit {GB,MB,KB,GiB,MiB,KiB,B}] Check command for PVE hosts via API @@ -135,6 +135,7 @@ Check Options: --ignore-service NAME Ignore service NAME in checks --ignore-disk NAME Ignore disk NAME in health check + --ignore-pools NAME Ignore vms and containers in pool(s) NAME in checks -w THRESHOLD_WARNING, --warning THRESHOLD_WARNING Warning threshold for check value. Mutiple thresholds with name:value,name:value -c THRESHOLD_CRITICAL, --critical THRESHOLD_CRITICAL @@ -258,7 +259,7 @@ WARNING - Ceph Cluster is in warning state **Check ZFS pool health** ``` -./check_pve.py -u -p -e -m zfs-health -n pve +./check_pve.py -u -p -e -m zfs-health -n pve OK - All ZFS pools are healthy ``` diff --git a/check_pve.py b/check_pve.py index 7f7ef5b..80ce8fa 100755 --- a/check_pve.py +++ b/check_pve.py @@ -27,7 +27,7 @@ import re import sys -from typing import Callable, Dict, Optional, Union +from typing import Callable, Dict, Optional, Union, List try: import argparse @@ -126,6 +126,16 @@ def threshold_type(arg: str) -> Dict[str, "CheckThreshold"]: return thresholds +class RequestError(Exception): + """Exception for request related errors.""" + + def __init__(self, message: str, rc: int) -> None: + self.message = message + self.rc = rc + + super().__init__(self.message) + + class CheckPVE: """Check command for Proxmox VE.""" @@ -209,6 +219,9 @@ def request(self, url: str, method: str = "get", **kwargs: Dict) -> Union[Dict, else: message += f"HTTP error code was {response.status_code}" + if kwargs.get("raise_error", False): + raise RequestError(message, response.status_code) + self.output(CheckState.UNKNOWN, message) def get_ticket(self) -> str: @@ -664,6 +677,26 @@ def check_version(self) -> None: f"Your PVE instance version '{data['version']}' ({data['repoid']}) is up to date" ) + def _get_pool_members(self, pool: str) -> List[int]: + """Get a list of vmids, which are members of a given resource pool. + + NOTE: The request needs the Pool.Audit permission! + """ + members = [] + + try: + url = self.get_url(f"pools/{pool}") + pools = self.request(url, raise_error=True) + for pool in pools.get("members", []): + members.append(pool["vmid"]) + except RequestError: + print( + f"Unable to fetch members of pool '{pool}'. " + "Check if the name is correct and the role has the 'Pool.Audit' permission" + ) + + return members + def check_vzdump_backup(self, name: Optional[str] = None) -> None: """Check for failed vzdump backup jobs.""" tasks_url = self.get_url("cluster/tasks") @@ -696,13 +729,25 @@ def check_vzdump_backup(self, name: Optional[str] = None) -> None: nbu_url = self.get_url("cluster/backup-info/not-backed-up") not_backed_up = self.request(nbu_url) + if len(not_backed_up) > 0: - guest_ids = " ".join([str(guest["vmid"]) for guest in not_backed_up]) - if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]: - self.check_result = CheckState.WARNING - self.check_message += ( - f"\nThere are guests not covered by any backup schedule: {guest_ids}" - ) + guest_ids = [] + + for guest in not_backed_up: + guest_ids.append(str(guest["vmid"])) + + ignored_vmids = [] + for pool in self.options.ignore_pools: + ignored_vmids += map(str, self._get_pool_members(pool)) + + remaining_not_backed_up = sorted(list(set(guest_ids) - set(ignored_vmids))) + if len(remaining_not_backed_up) > 0: + if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]: + self.check_result = CheckState.WARNING + self.check_message += ( + "\nThere are unignored guests not covered by any backup schedule: " + + ", ".join(remaining_not_backed_up) + ) def check_memory(self) -> None: """Check memory usage of Proxmox VE node.""" @@ -999,6 +1044,15 @@ def parse_args(self) -> None: default=[], ) + check_opts.add_argument( + "--ignore-pools", + dest="ignore_pools", + action="append", + metavar="NAME", + help="Ignore vms and containers in pool(s) NAME in checks", + default=[], + ) + check_opts.add_argument( "-w", "--warning",