Skip to content

Commit

Permalink
Merge pull request #60 from jdreffein/ignore_pools
Browse files Browse the repository at this point in the history
feat: add option --ignore-pools to exclude vms in pools from backup c…
  • Loading branch information
nbuchwitz authored Jun 30, 2024
2 parents 2bb40d3 + 119f3a6 commit 322ab4a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 11 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ The ``icinga2`` folder contains the command definition and service examples for

```
usage: check_pve.py [-h] -e API_ENDPOINT [--api-port API_PORT] -u API_USER (-p API_PASSWORD | -t API_TOKEN) [-k] -m
{cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup} [-n NODE] [--name NAME]
[--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION]
[--unit {GB,MB,KB,GiB,MiB,KiB,B}]
{cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup}
[-n NODE] [--name NAME] [--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [--ignore-pools NAME]
[-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] [--unit {GB,MB,KB,GiB,MiB,KiB,B}]
Check command for PVE hosts via API
Expand Down Expand Up @@ -135,6 +135,7 @@ Check Options:
--ignore-service NAME
Ignore service NAME in checks
--ignore-disk NAME Ignore disk NAME in health check
--ignore-pools NAME Ignore vms and containers in pool(s) NAME in checks
-w THRESHOLD_WARNING, --warning THRESHOLD_WARNING
Warning threshold for check value. Multiple thresholds with name:value,name:value
-c THRESHOLD_CRITICAL, --critical THRESHOLD_CRITICAL
Expand Down Expand Up @@ -258,7 +259,7 @@ WARNING - Ceph Cluster is in warning state

**Check ZFS pool health**
```
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
OK - All ZFS pools are healthy
```

Expand Down
68 changes: 61 additions & 7 deletions check_pve.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

import re
import sys
from typing import Callable, Dict, Optional, Union
from typing import Callable, Dict, Optional, Union, List

try:
import argparse
Expand Down Expand Up @@ -126,6 +126,16 @@ def threshold_type(arg: str) -> Dict[str, "CheckThreshold"]:
return thresholds


class RequestError(Exception):
    """Error raised when an API request fails.

    Attributes:
        message: Human readable description of the failure.
        rc: Return code supplied by the raiser (the request code passes the
            HTTP status code of the failed response).
    """

    def __init__(self, message: str, rc: int) -> None:
        # Hand the message to Exception first so str(exc) yields it.
        super().__init__(message)

        self.rc = rc
        self.message = message


class CheckPVE:
"""Check command for Proxmox VE."""

Expand Down Expand Up @@ -209,6 +219,9 @@ def request(self, url: str, method: str = "get", **kwargs: Dict) -> Union[Dict,
else:
message += f"HTTP error code was {response.status_code}"

if kwargs.get("raise_error", False):
raise RequestError(message, response.status_code)

self.output(CheckState.UNKNOWN, message)

def get_ticket(self) -> str:
Expand Down Expand Up @@ -664,6 +677,26 @@ def check_version(self) -> None:
f"Your PVE instance version '{data['version']}' ({data['repoid']}) is up to date"
)

def _get_pool_members(self, pool: str) -> List[int]:
    """Get a list of vmids, which are members of a given resource pool.

    Pool members that carry no ``vmid`` are skipped instead of aborting the
    check with a ``KeyError``.

    If the request fails, a hint is printed and an empty list is returned, so
    that an unknown or inaccessible pool simply ignores nothing.

    NOTE: The request needs the Pool.Audit permission!

    Args:
        pool: Name of the resource pool to query.

    Returns:
        The vmids of all pool members which have one; empty on request errors.
    """
    try:
        url = self.get_url(f"pools/{pool}")
        pool_info = self.request(url, raise_error=True)
    except RequestError:
        print(
            f"Unable to fetch members of pool '{pool}'. "
            "Check if the name is correct and the role has the 'Pool.Audit' permission"
        )
        return []

    # Members are not guaranteed to have a 'vmid' (a pool can presumably also
    # hold non-guest entries such as storages), so only collect those that do.
    # A distinct loop name keeps the 'pool' parameter intact.
    return [member["vmid"] for member in pool_info.get("members", []) if "vmid" in member]

def check_vzdump_backup(self, name: Optional[str] = None) -> None:
"""Check for failed vzdump backup jobs."""
tasks_url = self.get_url("cluster/tasks")
Expand Down Expand Up @@ -696,13 +729,25 @@ def check_vzdump_backup(self, name: Optional[str] = None) -> None:

nbu_url = self.get_url("cluster/backup-info/not-backed-up")
not_backed_up = self.request(nbu_url)

if len(not_backed_up) > 0:
guest_ids = " ".join([str(guest["vmid"]) for guest in not_backed_up])
if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]:
self.check_result = CheckState.WARNING
self.check_message += (
f"\nThere are guests not covered by any backup schedule: {guest_ids}"
)
guest_ids = []

for guest in not_backed_up:
guest_ids.append(str(guest["vmid"]))

ignored_vmids = []
for pool in self.options.ignore_pools:
ignored_vmids += map(str, self._get_pool_members(pool))

remaining_not_backed_up = sorted(list(set(guest_ids) - set(ignored_vmids)))
if len(remaining_not_backed_up) > 0:
if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]:
self.check_result = CheckState.WARNING
self.check_message += (
"\nThere are unignored guests not covered by any backup schedule: "
+ ", ".join(remaining_not_backed_up)
)

def check_memory(self) -> None:
"""Check memory usage of Proxmox VE node."""
Expand Down Expand Up @@ -999,6 +1044,15 @@ def parse_args(self) -> None:
default=[],
)

check_opts.add_argument(
"--ignore-pools",
dest="ignore_pools",
action="append",
metavar="NAME",
help="Ignore vms and containers in pool(s) NAME in checks",
default=[],
)

check_opts.add_argument(
"-w",
"--warning",
Expand Down

0 comments on commit 322ab4a

Please sign in to comment.