Skip to content
This repository has been archived by the owner on Oct 11, 2023. It is now read-only.

Commit

Permalink
Init repo
Browse files Browse the repository at this point in the history
  • Loading branch information
guunergooner committed Feb 11, 2019
0 parents commit f2d0525
Show file tree
Hide file tree
Showing 9 changed files with 576 additions and 0 deletions.
28 changes: 28 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Installs the Zabbix agent together with the NVIDIA GPU monitoring scripts
# and the matching UserParameter file.  Must be run as root: `make install`.

# Directory that receives the UserParameter file (zabbix_agentd.d, which is
# also where the original install-agent-config recipe put it).
conf_path=/etc/zabbix/zabbix_agentd.d
# Directory holding the helper scripts referenced by the UserParameter file.
scripts_path=/etc/zabbix/scripts
# Files installed by this Makefile; `clean` removes exactly these.
agent_conf=userparameter_nvidia-smi.conf
scripts=get_gpus_info.sh nvidia-ml.py set_zabbix_config.sh
zabbix_release_rpm=https://repo.zabbix.com/zabbix/3.0/rhel/7/x86_64/zabbix-release-3.0-1.el7.noarch.rpm

# None of the targets below produce a file of the same name.
.PHONY: all install install-requirement install-agent-config install-scripts set-config clean

# The install steps have side effects on the system and must run in the
# order listed on the `install` rule, so parallel execution is disabled.
.NOTPARALLEL:

all:
	@echo "usage: make install"

install: install-requirement install-agent-config install-scripts set-config

install-requirement:
	yum clean all
	# Only register the Zabbix repository when it is not installed yet, so
	# that a second `make install` does not stop on the rpm step.
	rpm -q zabbix-release >/dev/null 2>&1 || rpm -Uvh ${zabbix_release_rpm}
	yum -y install zabbix-agent python-pip
	pip install --upgrade pip && pip install nvidia-ml-py

install-agent-config:
	install -o root -g root -m 644 ${agent_conf} ${conf_path}/${agent_conf}

install-scripts:
	install -d -o root -g root -m 755 ${scripts_path}
	install -o root -g root -m 755 ${scripts} ${scripts_path}

set-config:
	bash ${scripts_path}/set_zabbix_config.sh

# Remove only what this Makefile installed; unrelated files that live in the
# same directories are left alone.
clean:
	rm -f ${conf_path}/${agent_conf}
	rm -f $(addprefix ${scripts_path}/,${scripts})
	# Drop the scripts directory only when nothing else is left in it.
	test ! -d ${scripts_path} || rmdir ${scripts_path} 2>/dev/null || true
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
* Install
  - `sudo make install`

* Import the Zabbix template
  - Import `zbx_nvidia-smi-multi-gpu.xml` into Zabbix Templates
  - Create the GPU-Number and GPU-Avg-Utilization graphs
  - Select the host and add Template Nvidia GPUs Performance

* Restart zabbix-agentd
  - `bash restart.sh`
26 changes: 26 additions & 0 deletions get_gpus_info.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Zabbix low-level discovery helper.
#
# Runs "nvidia-ml.py --summary" and turns every line of the form
#   GPU <index>: <name> (UUID: <uuid>)
# into one element of a JSON document:
#   {"data":[{"{#GPUINDEX}":"<index>", "{#GPUUUID}":"<uuid>"}, ...]}

result=$(python /etc/zabbix/scripts/nvidia-ml.py --summary)
first=1

echo "{"
echo "\"data\":["

while IFS= read -r line
do
    # An empty summary (no GPUs, or the query failed) still yields one empty
    # line here; skip it so "data" stays an empty array instead of holding
    # an entry with empty index and UUID.
    if [[ -z "${line//[[:space:]]/}" ]]
    then
        continue
    fi

    # Separate consecutive entries with a comma.
    if (( first == 0 ))
    then
        echo ,
    fi

    # Index: second word of the text before the first colon ("GPU 0" -> "0").
    prefix=${line%%:*}
    index=${prefix#* }

    # UUID: text after the LAST colon, so a colon inside the device name
    # cannot shift the field; then drop the closing parenthesis and blanks.
    gpuuuid=${line##*:}
    gpuuuid=${gpuuuid//)/}
    gpuuuid=${gpuuuid// /}

    printf '{"{#GPUINDEX}":"%s", "{#GPUUUID}":"%s"}' "$index" "$gpuuuid"
    first=0
done < <(printf '%s\n' "$result")

echo
echo "]"
echo "}"
26 changes: 26 additions & 0 deletions install_zbx_agent.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Copies the files of the current directory to a remote host and runs the
# installation there (make install followed by restart.sh).
#
# Argument: $1 - remote host to install on (connected to as root).
# Exit status: non-zero on a usage error or when any remote step fails.

# Print the given message prefixed with the current timestamp.
function log () {
    echo "$(date +"[%Y-%m-%d %H:%M:%S]")" "$@"
}

if [ $# -lt 1 ]; then
    log "Usage: sh $0 host"
    # A missing argument is an error; report it through the exit status.
    exit 1
fi

host=$1
user=root
remote_dir=/home/admin/zabbix

# Create the remote directory, copy the files and run the installation.
# Stops at the first failing step and returns its failure.
function install_zabbix_agentd () {
    log "mkdir"
    # -p keeps re-runs working when the directory already exists.
    ssh -t "${user}@${host}" "mkdir -p ${remote_dir} && chown -R admin:admin ${remote_dir}" || return 1

    log "copy file"
    scp ./* "${user}@${host}:${remote_dir}" || return 1

    log "exec install"
    ssh -t "${user}@${host}" "cd ${remote_dir} && make install && sh restart.sh"
}

install_zabbix_agentd
135 changes: 135 additions & 0 deletions nvidia-ml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/local/bin python
# coding=utf-8

import optparse, sys, string
from pynvml import *

class OptionClass:
    """Command-line options of the GPU query script.

    After ``parse()`` the attributes hold the parsed values; an option that
    was not given on the command line stays ``None``.
    """

    def __init__(self):
        self.id = None              # GPU index given with -i/--id (int)
        self.properties = None      # property name given with -p/--properties
        self.number = None          # True when --number was given
        self.summary = None         # True when --summary was given
        self.avgGpuUtil = None      # True when --avg-gpu-util was given
        self.helpProperties = None  # True when --help-properties was given
        self.parser = None          # optparse parser, set by parse()

    def parse(self):
        """Parse ``sys.argv`` and store the option values on this object.

        A non-numeric ``--id`` is rejected by optparse itself, which prints a
        usage error and exits with status 2 instead of raising ValueError.
        """
        option_list = [
            optparse.make_option("-i", "--id",
                action="store", type="int", dest="id",
                help="Specific GPU unit id"),
            optparse.make_option("-p", "--properties",
                action="store", type="string", dest="properties",
                help="Query GPU properties"),
            optparse.make_option("--number",
                action="store_true", dest="number", help="Number of GPUs"),
            optparse.make_option("--summary",
                action="store_true", dest="summary", help="Summary list GPUs"),
            optparse.make_option("--avg-gpu-util",
                action="store_true", dest="avgGpuUtil", help="Average GPU utilization"),
            optparse.make_option("--help-properties",
                action="store_true", dest="helpProperties", help="Help properties of GPUs"),
        ]

        self.parser = optparse.OptionParser(option_list=option_list)
        (options, args) = self.parser.parse_args()

        # Options that were not given are None in `options`, which matches
        # the initial value of the attributes.
        self.id = options.id
        self.properties = options.properties
        self.number = options.number
        self.summary = options.summary
        self.avgGpuUtil = options.avgGpuUtil
        self.helpProperties = options.helpProperties

    def printHelpProperties(self):
        """Print the property names accepted by --properties."""
        # Single strings so that the output is the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        print("--properties=utilization.gpu Percent of executing on the GPU")
        print("--properties=memory.used Percent of used memory on the GPU")

    def validate(self):
        """Check the option combination.

        Exits with status 0 after printing the property help when
        --help-properties was given, and with status 1 after printing the
        usage when neither a global query nor both --id and --properties
        were given.
        """
        if self.helpProperties:
            self.printHelpProperties()
            sys.exit(0)

        # The global queries need neither --id nor --properties.
        if self.number or self.summary or self.avgGpuUtil:
            return

        if self.id is None or self.properties is None:
            self.parser.print_help()
            sys.exit(1)

class NvmlClass:
    """Wrapper around the pynvml calls used by this script.

    NVML is initialised when the object is created and shut down when the
    object is destroyed.
    """

    def __init__(self):
        # Remember whether initialisation succeeded so that __del__ does not
        # call nvmlShutdown() after a failed nvmlInit().
        self._initialized = False
        nvmlInit()
        self._initialized = True

    def __del__(self):
        if getattr(self, "_initialized", False):
            nvmlShutdown()

    @staticmethod
    def _toText(value):
        """Return `value` as text, decoding it when it is a bytes object."""
        # Under Python 2 bytes is str, so the value is returned unchanged.
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode("utf-8", "replace")
        return value

    def getDeviceNumber(self):
        """Return the number of devices reported by NVML."""
        return nvmlDeviceGetCount()

    def getDeviceSummary(self):
        """Return a list of {"id", "name", "uuid"} dicts, one per device."""
        summaryList = []
        for i in range(nvmlDeviceGetCount()):
            handle = nvmlDeviceGetHandleByIndex(i)
            # Decode bytes so the values format as plain text with "%s".
            name = self._toText(nvmlDeviceGetName(handle))
            uuid = self._toText(nvmlDeviceGetUUID(handle))
            summaryList.append({"id": i, "name": name, "uuid": uuid})
        return summaryList

    def getDeviceUtilizationGPU(self, id):
        """Return the `gpu` utilization rate of device `id` as an int."""
        handle = nvmlDeviceGetHandleByIndex(int(id))
        util = nvmlDeviceGetUtilizationRates(handle)
        return int(util.gpu)

    def getDeviceUtilizationGPUAvg(self):
        """Return the mean `gpu` utilization rate over all devices as an int.

        Returns 0 when no device is present.
        """
        deviceCount = nvmlDeviceGetCount()
        if deviceCount == 0:
            # Nothing to average; avoid dividing by zero.
            return 0

        util_gpu = 0.0
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            util_gpu += nvmlDeviceGetUtilizationRates(handle).gpu

        return int(util_gpu / deviceCount)

    def getDeviceMemoryUsed(self, id):
        """Return used memory of device `id` as an integer percentage of total."""
        handle = nvmlDeviceGetHandleByIndex(int(id))
        mem_info = nvmlDeviceGetMemoryInfo(handle)
        return int(float(mem_info.used) / float(mem_info.total) * 100)

def main():
    """Parse the command line, run the requested GPU query and print it.

    Every result line has the form "<label>:<value>".  An unknown
    --properties value prints the list of accepted properties and exits
    with status 1.
    """
    option = OptionClass()
    option.parse()
    option.validate()

    nvml = NvmlClass()
    # The "%" formatting is applied inside the print call and every message
    # is a single string, so the output is the same whether print is a
    # statement (Python 2) or a function (Python 3).
    if option.number:
        print("GPU number:%d" % nvml.getDeviceNumber())
    elif option.summary:
        for summary in nvml.getDeviceSummary():
            print("GPU %d: %s (UUID: %s)" %
                  (summary['id'], summary['name'], summary['uuid']))
    elif option.avgGpuUtil:
        print("GPU avg util:%d" % nvml.getDeviceUtilizationGPUAvg())
    elif option.properties == "utilization.gpu":
        print("GPU %d util:%d" %
              (option.id, nvml.getDeviceUtilizationGPU(option.id)))
    elif option.properties == "memory.used":
        print("GPU %d mem used:%d" %
              (option.id, nvml.getDeviceMemoryUsed(option.id)))
    else:
        print("Invalid properties: %s" % option.properties)
        option.printHelpProperties()
        sys.exit(1)

if __name__ == "__main__":
    main()
10 changes: 10 additions & 0 deletions restart.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Restart the Zabbix agent: stop every running zabbix_agentd process, then
# start a new one.

#stop
# pgrep -x matches the process name exactly, so unrelated commands that only
# mention zabbix_agentd on their command line (an editor on its config, tail
# on its log, ...) are not killed.
pids=$(pgrep -x zabbix_agentd | xargs)
if [ -n "${pids}" ]; then
    # Ask for a normal termination first and give the agent a few seconds.
    kill ${pids} 2>/dev/null
    for _ in 1 2 3 4 5; do
        pgrep -x zabbix_agentd >/dev/null || break
        sleep 1
    done

    # Force-kill whatever is still running after the grace period.
    pids=$(pgrep -x zabbix_agentd | xargs)
    if [ -n "${pids}" ]; then
        kill -9 ${pids}
    fi
fi

#start
/usr/sbin/zabbix_agentd
14 changes: 14 additions & 0 deletions set_zabbix_config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Points the local Zabbix agent at the monitoring server: sets AllowRoot,
# Server and ServerActive in the agent configuration and makes the server
# name resolvable through /etc/hosts.  Safe to run more than once.
modify_user=$(whoami)
modify_date=$(date +%Y-%m-%d)
monitor_host=10.88.128.40
monitor_name=monitor.dev.rokid-inc.com
hosts_conf=/etc/hosts
zabbix_agentd_conf=/etc/zabbix/zabbix_agentd.conf

# set_option KEY VALUE
# Rewrites every active (uncommented) "KEY=" line of the agent configuration
# to "KEY=VALUE"; when there is no such line, "KEY=VALUE" is appended.
# Matching on the anchored key avoids feeding grep output into the sed
# pattern, which broke when grep returned no line or more than one.
set_option () {
    local key=$1 value=$2
    if grep -qE "^[[:space:]]*${key}=" "${zabbix_agentd_conf}"; then
        sed -i -E "s|^[[:space:]]*${key}=.*|${key}=${value}|" "${zabbix_agentd_conf}"
    else
        # Leading newline guards against a file without a final newline.
        printf '\n%s=%s\n' "${key}" "${value}" >> "${zabbix_agentd_conf}"
    fi
}

set_option AllowRoot 1
set_option Server "${monitor_name}"
set_option ServerActive "${monitor_name}"

# Add the hosts entry only once so repeated runs do not pile up duplicates.
if ! grep -qF "${monitor_name}" "${hosts_conf}"; then
    echo -e "\n" >> ${hosts_conf}
    echo "#add by ${modify_user} ${modify_date}" >> ${hosts_conf}
    echo "${monitor_host} ${monitor_name}" >> ${hosts_conf}
fi
5 changes: 5 additions & 0 deletions userparameter_nvidia-smi.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Zabbix agent user parameters for NVIDIA GPU monitoring.
# nvidia-ml.py prints "<label>:<value>"; `cut -d ":" -f2` keeps the value.

# Number of GPUs (nvidia-ml.py prints "GPU number:<n>").
UserParameter=gpu.number,python /etc/zabbix/scripts/nvidia-ml.py --number | cut -d ":" -f2
# Average GPU utilization over all GPUs (nvidia-ml.py prints "GPU avg util:<n>").
UserParameter=gpu.avgutilization,python /etc/zabbix/scripts/nvidia-ml.py --avg-gpu-util | cut -d ":" -f2
# Discovery: get_gpus_info.sh prints JSON with one {#GPUINDEX}/{#GPUUUID} pair per GPU.
UserParameter=gpu.discovery,/etc/zabbix/scripts/get_gpus_info.sh
# Utilization of one GPU; $1 (the first item key parameter) is passed as the GPU id.
UserParameter=gpu.utilization[*],python /etc/zabbix/scripts/nvidia-ml.py -i $1 -p utilization.gpu | cut -d ":" -f2
# Used memory of one GPU as a percentage of total; $1 is passed as the GPU id.
UserParameter=gpu.memoryused[*],python /etc/zabbix/scripts/nvidia-ml.py -i $1 -p memory.used | cut -d ":" -f2
Loading

0 comments on commit f2d0525

Please sign in to comment.