snmp helper script, pass_persist
Long post. I'm sharing this mainly for information: this is how we collect everything through SNMP rather than opening up additional ports such as SSH. If you have ideas for making it better, I'm all ears!
Nothing fancy: I modified the real-world example that the snmp pass_persist project gives on its page. The main script loads its settings from a YAML config file, then calls each Python module, passing it the snmp pass_persist object (pp) and, where the module needs it, the config object.
The modules will run a quick test to see if they actually need to run... for example if the machine doesn't have DNS, then the dns_stats doesn't need to run.
The File Monitor and DNS Stats are capable of monitoring multiple instances.
Things that I want still:
-- The file monitor OIDs change whenever a file is added or removed, so maybe we should set LM to do autodiscovery based on a key or lookup instead?
-- The same applies to software RAIDs: they may need a state file to cache things in or, again, key/lookups instead.
-- dns_stats calls an external bash script to do the parsing because my Python-fu is very weak
-- clean up / insert comments everywhere !
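snmpd.conf (add this line to the bottom; the script path must match the filename you actually saved the main script under, since the two names shown in this post differ):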
pass_persist .1.3.6.1.4.1.6556 /usr/share/snmp/extensions/lm/logicmonitor_helper.py
config.yml: (/usr/share/snmp/extensions/lm/)
---
# Settings read by the helper modules.
# dns.instances: one entry per BIND instance; the dns_stats module reads
# 'name' and 'stats' from each entry.
dns:
  instances:
    - name: 'Main DNS'
      config: '/etc/named.conf'
      stats: '/var/named/data/named_stats.txt'
    - name: 'Special voodoo DNS'
      config: '/etc/named.voodoo.conf'
      stats: '/var/named.voodoo/data/named_stats.txt'
# file_monitor.directories: directories whose files are reported by the
# file monitor module.
file_monitor:
  directories:
    - '/var/opt/logs/servers'
    - '/var/opt/logs/network'
Main script: (/usr/share/snmp/extensions/lm/LogicMonitorHelper.py)
#!/usr/bin/python -u
# -*- coding:Utf-8 -*-
# Option -u is needed for communication with snmpd
import sys
sys.path.append('/usr/share/snmp/extensions/lm/libraries')
import snmp_passpersist as snmp
import os, re, socket, syslog, time, errno
import LM_Config as cfg
import LM_Software_Raid
import LM_FileMonitor
import LM_dns_stats
# Global vars
# Shared PassPersist instance; assigned in main() and used by update_data().
pp = None
# Parsed configuration, loaded once when this script starts.
config = cfg.loadConfig()
# Version string that update_data() publishes with pp.add_str('0.0', ...).
VER = "Logic Monitor SNMP Helper v0.1"
# Handed to pp.start() as the refresh interval (presumably seconds - confirm
# against the snmp_passpersist documentation).
POLLING_INTERVAL = 60
# Number of restarts main() allows before it gives up and exits.
MAX_RETRY = 5
# OID subtree passed to snmp.PassPersist().
OID_BASE = ".1.3.6.1.4.1.6556"
def update_data():
    """Publish the helper version and let every module add its OIDs.

    main() hands this function to pp.start() as the update callback. It
    writes to the module-level ``pp`` object and passes the module-level
    ``config`` to the modules that accept one.
    """
    pp.add_str('0.0', VER)
    # The software RAID module takes no configuration.
    LM_Software_Raid.doUpdate(pp)
    # The remaining modules share the same (pp, config) call signature.
    for module in (LM_FileMonitor, LM_dns_stats):
        module.doUpdate(pp, config)
def main():
    """Run the pass_persist loop and restart it after failures.

    Creates the module-level ``pp`` PassPersist object and starts it with
    update_data() as the refresh callback. When the loop ends or raises,
    the event is logged to syslog and the loop is restarted after 15
    seconds. Each failure that follows the previous one within an hour
    decrements a counter that starts at MAX_RETRY; a failure after more
    than an hour of quiet resets it. When the counter reaches zero the
    process exits with status 1.

    Exits with status 0 on KeyboardInterrupt or when snmpd closes the
    pipe (EPIPE).
    """
    # Declared up front: Python 3 rejects a ``global`` statement that
    # appears after the name has already been used in the function.
    global pp

    syslog.openlog(sys.argv[0], syslog.LOG_PID)
    retry_timestamp = int(time.time())
    retry_counter = MAX_RETRY
    while retry_counter > 0:
        try:
            syslog.syslog(syslog.LOG_INFO, "Starting Logic Monitor SNMP Helper...")
            # Load helpers
            pp = snmp.PassPersist(OID_BASE)
            # Shouldn't return (except if the updater thread has died)
            pp.start(update_data, POLLING_INTERVAL)
        except KeyboardInterrupt:
            print("Exiting on user request.")
            sys.exit(0)
        except IOError as e:
            if e.errno == errno.EPIPE:
                syslog.syslog(syslog.LOG_INFO, "Snmpd had close the pipe, exiting...")
                sys.exit(0)
            else:
                syslog.syslog(syslog.LOG_WARNING, "Updater thread as died: IOError: %s" % (e))
        except Exception as e:
            syslog.syslog(syslog.LOG_WARNING, "Main thread as died: %s: %s" % (e.__class__.__name__, e))
        else:
            syslog.syslog(syslog.LOG_WARNING, "Updater thread as died: %s" % (pp.error))

        syslog.syslog(syslog.LOG_WARNING, "Restarting monitoring in 15 sec...")
        time.sleep(15)

        # Errors frequency detection
        now = int(time.time())
        if (now - 3600) > retry_timestamp:
            # The previous error is older than one hour: start counting again.
            retry_counter = MAX_RETRY
        else:
            retry_counter -= 1
        retry_timestamp = now

    # The original text told the operator to check xen, a leftover from the
    # example this script was derived from; point at this helper instead.
    syslog.syslog(syslog.LOG_ERR, "Too many retries, aborting... Please check the helper modules and config.yml !")
    sys.exit(1)
# Start the helper only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
main()
LM_Config.py: (/usr/share/snmp/extensions/lm/libraries)
#!/usr/bin/python
import sys
sys.path.append('/usr/share/snmp/extensions/lm/libraries')
import yaml
# Absolute path of the YAML file that loadConfig() opens.
# NOTE(review): the name shadows Python 2's built-in ``file``; it is kept
# because loadConfig() refers to it.
file = '/usr/share/snmp/extensions/lm/config.yml'
def loadConfig():
    """Load the YAML configuration from the module-level ``file`` path.

    Returns:
        The parsed YAML document, or None when the file cannot be read
        or is not valid YAML. Callers must cope with None.

    Problems are reported on stderr. The main helper script talks to
    snmpd over stdout, so diagnostics must not be printed there.
    """
    try:
        with open(file) as f:
            # safe_load builds only plain Python objects and, unlike
            # yaml.load(), does not need an explicit Loader argument on
            # current PyYAML releases.
            return yaml.safe_load(f)
    except (IOError, OSError) as e:
        sys.stderr.write("%s could not be read: %s\n" % (file, e))
    except yaml.YAMLError as e:
        sys.stderr.write("%s is not valid YAML: %s\n" % (file, e))
    return None
LM_Software_Raid.py: (/usr/share/snmp/extensions/lm/libraries)
#!/usr/bin/python
import sys
sys.path.append('/usr/share/snmp/extensions/lm/libraries')
import re
import mdstat
import json
import argparse
import LM_Config as cfg
# mdstat.parse() snapshot taken when this module is imported; read by
# testModule() and doUpdate().
md = mdstat.parse()
def testModule():
    """Return True when the parsed mdstat data lists at least one md device.

    Reads the module-level ``md`` dictionary. The previous version also
    re-read config.yml on every call and discarded the result; that
    unused file read has been removed.
    """
    return len(md['devices']) > 0
def to_bool(*args):
    """Map a tri-state flag onto the 0/1 integers published over SNMP.

    Call as ``to_bool(value)`` or ``to_bool(value, "rev")``.

    Args:
        args[0]: the flag to convert.
        args[1]: optional; the string "rev" selects the reversed mapping.
            Any other value is ignored.

    Returns:
        Normal mode: 0 for True/None, 1 for False.
        Reversed mode: 1 for True/None, 0 for False.
        Any other value is returned unchanged in both modes.
    """
    value = args[0]
    reverse = len(args) > 1 and args[1] == "rev"
    # ``==`` rather than ``is`` on purpose: the integers 1 and 0 keep being
    # treated like True and False, exactly as before.
    if value == True or value is None:
        return 1 if reverse else 0
    if value == False:
        return 0 if reverse else 1
    # Previously this path raised NameError (undefined ``s``) or fell
    # through and returned None.
    return value
def doUpdate(pp):
    """Publish software RAID information under the '1.*' OIDs.

    Args:
        pp: object providing ``add_str(oid, value)``; the main script
            passes its PassPersist instance.

    Layout, relative to the base OID of ``pp``:
        1.0.<m>      md device name
        1.1.<m>      to_bool() of the device's 'active' flag
        1.2.<m>.0.1  number of disks in the device
        1.2.<m>.1.1  number of disks whose 'faulty' flag is not set
        1.3.<m>      the device's 'personality' value
        1.4.<d>      disk name
        1.5.<d>      disk faulty flag (1 when faulty)
        1.6.<d>      md device the disk belongs to
    <m> counts md devices in sorted name order; <d> is a running index
    over all disks of all devices. Nothing is published when no md device
    is found.
    """
    global md
    # Re-read the RAID state on every poll. The module-level value is
    # captured at import time and would otherwise never change while the
    # helper keeps running.
    md = mdstat.parse()

    if not testModule():
        return

    # headers for each section in the loops below
    pp.add_str('1.0', "md name")
    pp.add_str('1.1', "md status")
    # 1.2 headers are per device and are written inside the loop
    pp.add_str('1.3', "md raid type")
    pp.add_str('1.4', "Disks in software raids")
    pp.add_str('1.5', "Disks Faulty Status")
    pp.add_str('1.6', "DiskX belongs to mdX")

    devices = md['devices']
    disk_index = 0
    for md_index, md_name in enumerate(sorted(devices)):
        device = devices[md_name]
        md_oid = str(md_index)

        pp.add_str('1.0.' + md_oid, md_name)
        pp.add_str('1.1.' + md_oid, to_bool(device['active']))
        pp.add_str('1.2.' + md_oid + '.0.0', md_name + " disk count")
        pp.add_str('1.2.' + md_oid + '.1.0', md_name + " non faulted disk count")
        pp.add_str('1.3.' + md_oid, device['personality'])

        disks = device['disks']
        disk_count = 0
        healthy_count = 0
        # Sorted so that a given disk keeps the same OID index from one
        # poll to the next instead of following dictionary order.
        for disk_name in sorted(disks):
            faulty = disks[disk_name]['faulty']
            pp.add_str('1.4.' + str(disk_index), disk_name)
            pp.add_str('1.5.' + str(disk_index), to_bool(faulty, "rev"))
            pp.add_str('1.6.' + str(disk_index), md_name)
            # to_bool() without "rev" yields 1 for a non-faulty disk.
            healthy_count = healthy_count + to_bool(faulty)
            disk_index = disk_index + 1
            disk_count = disk_count + 1

        pp.add_str('1.2.' + md_oid + '.0.1', disk_count)
        pp.add_str('1.2.' + md_oid + '.1.1', healthy_count)
LM_FileMonitor.py: (/usr/share/snmp/extensions/lm/libraries)
#!/usr/bin/python
import sys
sys.path.append('/usr/share/snmp/extensions/lm/libraries')
import os
import time
import glob
from stat import *
# Configured directories that exist on this host; filled by testModule()
# and read by doUpdate().
dirs = []
# Collapse a tri-state flag into the 0/1 convention used by the SNMP output.
def to_bool(s):
    """Return 0 for True or None, 1 for False, and any other value as is.

    The comparisons use ``==``, so the integers 1 and 0 are handled like
    True and False.
    """
    counts_as_true = (s == True) or (s == None)
    if counts_as_true:
        return 0
    return 1 if s == False else s
def testModule(config):
    """Collect the configured directories that exist on this host.

    Args:
        config: parsed configuration; the list is read from
            config['file_monitor']['directories']. A missing config,
            section or list is treated as "no directories".

    Returns:
        True when at least one configured directory exists.

    Side effect: the module-level ``dirs`` list is rebuilt in place.
    """
    # Empty the shared list first. It used to be appended to on every
    # poll, so it grew without bound and repeated every directory.
    del dirs[:]
    section = (config or {}).get('file_monitor') or {}
    for path in section.get('directories') or []:
        if os.path.isdir(path):
            dirs.append(path)
    return len(dirs) > 0
def doUpdate(pp, config):
    """Publish per-file details for every monitored directory under '2.*'.

    Args:
        pp: object providing ``add_str(oid, value)``; the main script
            passes its PassPersist instance.
        config: parsed configuration, forwarded to testModule().

    Layout, relative to the base OID of ``pp``:
        2.0.1      the list of monitored directories
        2.2.1      time of this check (epoch)
        2.2.3.<n>  file name
        2.2.4.<n>  modification time
        2.2.5.<n>  full path
        2.2.6.<n>  size
        2.2.7.<n>  st_ctime
        2.2.8.<n>  access time
    <n> is a running index over all files of all directories. When no
    directory is available a single message is published at '2'.
    """
    # testModule() appends to the module-level list; start from an empty
    # one so repeated polls never accumulate duplicate directories.
    del dirs[:]
    if not testModule(config):
        pp.add_str('2', "LM_FileMonitor module did not find any directories to monitor")
        return

    pp.add_str('2.0.0', "Directories")
    pp.add_str('2.0.1', dirs)
    pp.add_str('2.2.0', "Last Check Time (epoch)")
    pp.add_str('2.2.1', time.time())
    pp.add_str('2.2.3', "Files")
    pp.add_str('2.2.4', "Modification Time at last check (epoch)")
    pp.add_str('2.2.5', "Full Path")
    pp.add_str('2.2.6', "File size")
    pp.add_str('2.2.7', "Created (epoch)")
    pp.add_str('2.2.8', "Access Time (epoch)")

    # One index across all directories. The old per-directory counter
    # restarted at 0, so each directory overwrote the entries of the
    # previous one.
    index = 0
    for directory in dirs:
        try:
            # Same selection as glob("*") (names starting with '.' are
            # skipped), without changing the process working directory.
            # Sorted so a file keeps its index while the set is unchanged.
            names = sorted(n for n in os.listdir(directory)
                           if not n.startswith('.'))
        except OSError:
            # Directory vanished or became unreadable since testModule().
            continue
        for name in names:
            full_path = directory + '/' + name
            try:
                statinfo = os.stat(full_path)
            except OSError:
                # File removed between listing and stat (or a dangling
                # symlink): leave it out rather than abort the whole poll.
                continue
            suffix = str(index)
            pp.add_str('2.2.3.' + suffix, name)
            pp.add_str('2.2.4.' + suffix, str(statinfo[ST_MTIME]))
            pp.add_str('2.2.5.' + suffix, full_path)
            pp.add_str('2.2.6.' + suffix, str(statinfo[ST_SIZE]))
            pp.add_str('2.2.7.' + suffix, str(statinfo[ST_CTIME]))
            pp.add_str('2.2.8.' + suffix, str(statinfo[ST_ATIME]))
            index = index + 1
LM_dns_stats.py: (/usr/share/snmp/extensions/lm/libraries)
#!/usr/bin/python
import sys
sys.path.append('/usr/share/snmp/extensions/lm/libraries')
import os
import re
import subprocess
# Parallel lists filled by testModule() and read by doUpdate():
# readable statistics files and the names of the instances they belong to.
dns_stats = []
dns_desc = []
def testModule(config):
    """Collect the configured DNS instances whose stats file is readable.

    Args:
        config: parsed configuration; instances are read from
            config['dns']['instances'], each providing 'stats' and
            'name'. A missing config, section or list is treated as
            "no instances".

    Returns:
        True when at least one instance has an existing, readable
        statistics file.

    Side effect: the module-level ``dns_stats`` and ``dns_desc`` lists
    are rebuilt in place and stay index-aligned.
    """
    # Empty the shared lists first. They used to be appended to on every
    # poll, so they grew without bound and repeated every instance.
    del dns_stats[:]
    del dns_desc[:]
    section = (config or {}).get('dns') or {}
    for instance in section.get('instances') or []:
        stats_file = instance['stats']
        if os.path.exists(stats_file) and os.access(stats_file, os.R_OK):
            dns_stats.append(stats_file)
            dns_desc.append(instance['name'])
    return len(dns_stats) > 0
def _run_stats_script(stats_file, section):
    """Run the helper script for one section and return its output as text."""
    script = "/usr/share/snmp/extensions/lm/helper_scripts/stats.sh"
    output = subprocess.check_output([script, stats_file, section])
    # check_output() returns bytes on Python 3 and str on Python 2.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    return output


def _publish_section(pp, base, dnsx, output):
    """Publish the 'name,value' lines of one script output under ``base``."""
    index = 0
    for line in output.split('\n'):
        fields = line.split(',')
        # Skip blank lines and lines without a value field; the latter
        # used to raise IndexError.
        if len(fields) < 2 or not fields[0]:
            continue
        pp.add_str(base + '.0.0.' + str(index), fields[0])
        pp.add_str(base + '.1.' + str(dnsx) + '.' + str(index), fields[1])
        index = index + 1


def doUpdate(pp, config):
    """Publish BIND statistics for every usable DNS instance under '3.*'.

    Args:
        pp: object providing ``add_str(oid, value)``; the main script
            passes its PassPersist instance.
        config: parsed configuration, forwarded to testModule().

    Layout, relative to the base OID of ``pp``:
        3.0.<d>          instance name
        3.1.<d>          "Stats from <file>"
        3.<s>.0.0.<i>    counter name
        3.<s>.1.<d>.<i>  counter value
    <d> is the index of the instance among the usable ones, <s> is 2
    (incoming), 3 (outgoing), 4 (resolver) or 5 (socket), and <i> is the
    line index within the script output. When no instance is usable a
    single message is published at '3'.
    """
    # testModule() appends to the module-level lists; start from empty
    # ones so repeated polls never accumulate duplicate instances.
    del dns_stats[:]
    del dns_desc[:]
    if not testModule(config):
        # The old call concatenated the OID and the message into a single
        # argument, which made add_str() fail for lack of a value.
        pp.add_str('3', "LM_dns_stats module did not find any bind intances from the config file")
        return

    sections = (
        ('3.2', 'incoming'),
        ('3.3', 'outgoing'),
        ('3.4', 'resolver'),
        ('3.5', 'socket'),
    )
    for dnsx, stats_file in enumerate(dns_stats):
        pp.add_str('3.0.' + str(dnsx), str(dns_desc[dnsx]))
        pp.add_str('3.1.' + str(dnsx), "Stats from " + stats_file)
        for base, section in sections:
            # The old incoming loop also tested ``data[1] > 0``. That
            # compared a string with an int, which is always true on
            # Python 2, so dropping it keeps the published set unchanged.
            _publish_section(pp, base, dnsx,
                             _run_stats_script(stats_file, section))
stats.sh: (/usr/share/snmp/extensions/lm/helper_scripts/)
#!/bin/bash
# Usage: stats.sh <named statistics file> [incoming|outgoing|resolver|socket]
# Prints "name,value" lines for the requested section of the file.
file=${1}
# Exit quietly when no file was given or it is not a regular file.
# The first test used to be `[ -x ${file} ]`, which is only true for an
# empty argument by accident and also rejected executable stats files.
[ -z "${file}" ] && exit 0
[ -f "${file}" ] || exit 0
# Names that must always be reported, with 0 when the file has no entry.
dnsNames=(A A6 AAAA ANY CNAME DNSKEY DS MX NAPTR NS PTR SOA SPF SRV TXT)
resNames=('mismatch responses received' 'IPv4 queries sent' 'IPv4 responses received' 'NXDOMAIN received' 'SERVFAIL received' 'FORMERR received' 'query retries' 'query timeouts' 'queries with RTT < 10ms' 'queries with RTT 10-100ms' 'queries with RTT 100-500ms' 'queries with RTT 500-800ms' 'queries with RTT 800-1600ms' 'queries with RTT > 1600ms')
sockNames=('UDP/IPv4 sockets opened' 'UDP/IPv4 sockets closed' 'UDP/IPv4 socket bind failures' 'UDP/IPv4 connections established' 'UDP/IPv4 recv errors' 'TCP/IPv4 sockets opened' 'TCP/IPv4 sockets closed' 'TCP/IPv4 socket bind failures' 'TCP/IPv4 connections established' 'TCP/IPv4 recv errors')
now_epoch=$(date +%s)
# %Y is the last modification time. The previous %W is the birth time,
# which stat reports as 0 when the filesystem does not record it, making
# the file look stale on every run.
mtime_epoch=$(stat -c %Y "${file}")
# Regenerate the statistics file when it is more than 300 seconds old:
# remove it and ask named for a fresh dump.
# NOTE(review): `rndc stats` is called without a server or config option,
# so it presumably only reaches the default named instance - confirm for
# additional instances.
function updateStats {
    # Paths are quoted so names containing spaces or glob characters work.
    if [ -f "${file}" ] && [ $((now_epoch - mtime_epoch)) -gt 300 ]; then
        rm -f "${file}" && rndc stats
    fi
}
# Print "name,value" for the counters found between two marker lines.
#   $1  sed pattern matching the first line of the section
#   $2  sed pattern matching the line that ends the section
#   $3  name of the array listing counters that must always be reported;
#       any of them missing from the file is appended with value 0
# Reads the global ${file}.
function getStats {
    local start=$1
    local end=$2
    # Indirect reference to the array named by $3; replaces the eval.
    local expected_ref="${3}[@]"
    local -a names=() values=()
    local value name wanted have found idx

    while read -r value name; do
        # Skip lines without a name. An empty section used to add a value
        # with no matching name and shift every later pair.
        [ -n "${name}" ] || continue
        names+=("${name}")
        values+=("${value}")
    done < <(sed -n "/${start}/,/${end}/p" "${file}" | \
        grep -E '[0-9]' | \
        awk '{ print $1" "$2 }')

    for wanted in "${!expected_ref}"; do
        # Exact comparison. The previous substring match treated "A" as
        # present whenever "AAAA" or "ANY" was.
        found=0
        for have in "${names[@]}"; do
            if [ "${have}" = "${wanted}" ]; then
                found=1
                break
            fi
        done
        if [ "${found}" -eq 0 ]; then
            names+=("${wanted}")
            values+=("0")
        fi
    done

    for idx in "${!names[@]}"; do
        echo "${names[$idx]},${values[$idx]}"
    done
}
# Report the counters between the "Incoming Q" and "Outgoing Q" markers,
# filling in the record types listed in dnsNames.
inStats() {
    getStats "Incoming Q" "Outgoing Q" "dnsNames"
}
# Report the counters from the "Outgoing Q" marker up to the next line
# matching "^+", filling in the record types listed in dnsNames.
outStats() {
    getStats "Outgoing Q" "^+" "dnsNames"
}
# Print "name,value" for every counter listed in resNames; 0 when the
# counter does not appear in ${file}.
# NOTE(review): if a counter line occurs more than once in the file, the
# value contains several lines - confirm whether that can happen.
function resStats {
    local n value
    for n in "${resNames[@]}"; do
        # The file path is quoted so names containing spaces work, and the
        # counter name is used directly instead of via an echo subshell.
        value=$(grep -E " ${n}$" "${file}" | \
            sed -n 's/.* \([0-9]*\) \([A-Za-z].*\)/\1/p')
        [ ${#value} -eq 0 ] && value=0
        echo "${n},${value}"
    done
}
# Print "name,value" for every counter listed in sockNames; 0 when the
# counter does not appear in ${file}.
# NOTE(review): if a counter line occurs more than once in the file, the
# value contains several lines - confirm whether that can happen.
function resSocket {
    local n value
    for n in "${sockNames[@]}"; do
        # The file path is quoted so names containing spaces work.
        value=$(grep -E " ${n}$" "${file}" | \
            sed -n 's/.* \([0-9]*\) \([A-Za-z].*\)/\1/p')
        [ ${#value} -eq 0 ] && value=0
        echo "${n},${value}"
    done
}
# Every invocation first prints the two timestamps, then the section
# selected by the second argument (incoming when it is missing or unknown).
echo "stats_epoch,${mtime_epoch}"
echo "now_epoch,${now_epoch}"
case ${2} in
incoming) inStats;;
outgoing) outStats;;
resolver) resStats;;
socket) resSocket;;
*) inStats;;
esac
# Call updateStats last. It removes the statistics file and requests a new
# dump, so running it before the queries above would leave them reading a
# file that was only just recreated.
updateStats
exit 0