#! /usr/bin/python
# -*- coding: UTF-8 -*-
#-------------------------------------------------------------------------------
# Name:        check_providers.py
# Purpose:     enable/disable Shorewall providers based on ICMP reachability
#              update openvpn configuration
# Author:      htouvet
#
# Created:     03/03/2014
# Copyright:   (c) htouvet 2014
# Licence:     GPL V2
#
# Provenance:  recovered from the git patch "initial public release"
#              (commit 86be5f430849fc4e1f3136332dd66ce370144f11, htouvet,
#              Wed, 19 Mar 2014 10:04:56 +0100).  The same patch also added
#              "/.svn" to .gitignore and shipped samples/check-providers.ini,
#              which is reproduced as a comment at the end of this file.
#
# NOTE(review): the patch text had lost every "<...>" span (regex group names,
# "ip link" flag lists).  Group names that the code reads back through
# groupdict() were restored from those reads; the others are best guesses and
# are marked as such below.
#-------------------------------------------------------------------------------

import os
import sys
import subprocess
import logging
import re
import time
import datetime

import json

import signal

try:
    from iniparse import RawConfigParser
except ImportError:
    # iniparse is optional: the standard parser offers the same
    # read/sections/has_option/get/getint/getfloat calls used here.
    try:
        from configparser import RawConfigParser
    except ImportError:
        from ConfigParser import RawConfigParser
from optparse import OptionParser

usage = """\
%prog -c configfile action

Check reachability of multiple providers managed by Shorewall
enable or disable the providers based on maximum packets loss or RTT

action is either :
  monitor                      : monitor in background all providers and enable/disable them
  check [all|PROVIDER...]      : check all or some providers and display reachability
  check-json [all|PROVIDER...] : check providers and output state as json data
  trigger                      : wake up the running monitor (it logs its state as json)
  stop                         : terminate the running monitor
"""

version = "0.0.1"

# Root logger, as in the original script.  It is created at import time so the
# functions below are usable from another module; main() attaches the handlers.
logger = logging.getLogger()

parser = OptionParser(usage=usage, version="%prog " + version)
parser.add_option("-i", "--check-interval", dest="check_interval", type=int, default=60,
                  help="Seconds between two checks in monitor mode (default: %default)")
parser.add_option("-p", "--ping-count", dest="ping_count", type=int, default=0,
                  help="Override ping count (default: %default)")
parser.add_option("-c", "--config", dest="config", default='/etc/check-providers.ini',
                  help="Config file full path (default: %default)")
parser.add_option("-d", "--dry-run", dest="dry_run", default=False, action='store_true',
                  help="Dry run (default: %default)")
parser.add_option("-v", "--verbose", dest="verbose", default=False, action='store_true',
                  help="More information (default: %default)")
parser.add_option("-o", "--log", dest="logfile", default=None,
                  help="Path to log file (default: %default)")
parser.add_option("-l", "--loglevel", dest="loglevel", default='info', type='choice',
                  choices=['debug', 'warning', 'info', 'error', 'critical'],
                  metavar='LOGLEVEL', help="Loglevel (default: %default)")

# Summary line of ping.  Only 'loss' is read; 'sent'/'received' are guessed names.
REPORT = re.compile(r'\n(?P<sent>\d+)\s+packets transmitted,\s+(?P<received>\d+) received,'
                    r'\s+(?P<loss>\d+)%\s+packet loss')
# Round-trip line of ping.  Only 'avg' is read; the other names are guessed.
RTT = re.compile(r'rtt min/avg/max/mdev = (?P<min>[0-9.]+)/(?P<avg>[0-9.]+)/'
                 r'(?P<max>[0-9.]+)/(?P<mdev>[0-9.]+) ms')

# Reply line of "arping" (Thomas Habets flavour), e.g.
#   60 bytes from 00:0a:fa:24:18:f7 (192.168.149.254): index=0 time=229.674 usec
ARPING1 = re.compile(r'bytes from (?P<mac>\S+).*time=(?P<rtt>[0-9.]*) (?P<unit>.*)')
# Reply line of "iputils-arping", e.g.
#   Unicast reply from 192.168.149.254 [00:0A:FA:24:18:F7]  0.886ms
ARPING2 = re.compile(r'reply from.*\[(?P<mac>\S+)\]\s+(?P<rtt>[0-9.]*)(?P<unit>.*)')

# "ip link show" header, e.g. (illustrative)
#   3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN mode DEFAULT qlen 1000
LINK = re.compile(r':\s+<(?P<link_states>.+)>.* state (?P<link_status>.+?)\s')
# "ip addr show" lines: "inet a.b.c.d/nn" and "link/ether aa:bb:..." (or "link/ppp").
IPV4ADDR = re.compile(r'\sinet\s+(?P<ipv4>\d+\.\d+\.\d+\.\d+)[/\s]')
MACADDR = re.compile(r'link/(?P<type>\S+)(\s(?P<mac>\S+))?')
# Default route of a provider routing table.
GW = re.compile(r'default via (?P<gateway>\d+\.\d+\.\d+\.\d+)\s+')


def run(cmd, dry_run=False):
    """Run the shell command line cmd and return (returncode, output).

    output is stdout and stderr merged, as text.  With dry_run the command is
    only printed and (0, placeholder text) is returned.  A non-zero exit status
    is reported through returncode, never raised.

    NOTE: cmd goes through the shell; every caller builds it from the
    configuration file, which must therefore be trusted.
    """
    logger.debug(' running {}'.format(cmd))
    if dry_run:
        print("DRYRUN : {}".format(cmd))
        return (0, "#### DRYRUN ### no output for {}".format(cmd))
    try:
        # universal_newlines gives text (not bytes) under Python 3 as well.
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        return (e.returncode, e.output)
    logger.debug(' output : {}'.format(output))
    return (0, output)


def default_json(o):
    """json.dumps 'default' hook: as_dict(), as_json(), ISO datetime, else text."""
    if hasattr(o, 'as_dict'):
        return o.as_dict()
    elif hasattr(o, 'as_json'):
        return o.as_json()
    elif isinstance(o, datetime.datetime):
        return o.isoformat()
    else:
        return u"{}".format(o)


def jsondumps(o, **kwargs):
    """extended json dump of o"""
    return json.dumps(o, default=default_json, **kwargs)


def arping(device, target_ip, ping_count=3):
    """ARP-ping target_ip through device.

    Returns a dict with 'mac' and 'rtt' (value + unit, as text) of the last
    reply and 'alive' (True when at least one reply was parsed).  Both keys
    are None and 'alive' is False when no reply line is recognised.

    NOTE(review): the command uses "-i<device>", the syntax of the first
    arping flavour; iputils-arping expects "-I" -- confirm on the target.
    """
    (returncode, output) = run('/usr/sbin/arping -c{ping_count} -i{device} {target_ip}'.format(
        ping_count=ping_count,
        device=device,
        target_ip=target_ip,
        ))

    packets = [p.groupdict() for p in ARPING1.finditer(output)]
    if not packets:
        packets = [p.groupdict() for p in ARPING2.finditer(output)]
    if packets:
        last = packets[-1]
        return {'mac': last['mac'], 'rtt': last['rtt'] + last['unit'], 'alive': True}
    return {'mac': None, 'rtt': None, 'alive': False}


class Provider(object):
    def __init__(self, provider_name, device=None, gateway=None, target_ip=None,
                 max_rtt=2000.0, max_loss=30, ping_count=10, ping_interval=0.5,
                 timeout=1.5, led=None):
        """Parameters of an Internet provider as defined in Shorewall and availability limits

        max_rtt is compared with the average ping RTT (ms) and max_loss with
        the packet loss percentage; gateway '-' is the Shorewall notation for
        "no gateway" and None means "read it from the provider routing table".
        """
        self.target_ip = target_ip
        self.provider_name = provider_name
        self.device = device
        self.device_type = None
        self.device_mac = None
        self.last_ip = None

        self._gateway = gateway

        self.gateway_alive = None
        self.gateway_rtt = None
        self.gateway_mac = None

        self.max_rtt = max_rtt
        self.max_loss = max_loss
        self.ping_count = ping_count
        self.ping_interval = ping_interval
        self.timeout = timeout

        self.openvpn_master = 0
        self.fallback = 0

        self.last_rtt = None
        self.last_loss = None

        self._available = None
        self._link_states = []
        self._link_status = 'UNKNOWN'

        self.led = led
        self.status = ''
        self.last_check_time = None
        self.last_enabled = None

        self.dry_run = False

    def used_by_openvpn(self, proto='udp', port=1194):
        """Return the conntrack flow lines of proto/port that involve our last known IP.

        The result is empty (falsy) when no such flow exists.
        """
        if not self.last_ip:
            return []
        (retcode, output) = run(
            'conntrack -L -p {proto} --dport {port} -o extended | grep "={src}"'.format(
                proto=proto, src=self.last_ip, port=port))
        # Typical output:
        #   ipv4 2 udp 17 178 src=192.168.149.184 dst=80.13.55.10 sport=1194 dport=1194 ... mark=1 use=1
        #   conntrack v1.2.1 (conntrack-tools): 1 flow entries have been shown.
        # The summary comes from stderr, so its position is not guaranteed:
        # keep the flow lines by content instead of dropping the last line.
        return [line for line in output.splitlines() if 'src=' in line]

    def read_config(self, config_file):
        """Load the options of section [provider_name] from the parsed config_file."""
        section = self.provider_name
        for attrib in ['target_ip', 'device', 'gateway']:
            if config_file.has_option(section, attrib):
                # 'gateway' is a computed property; store the raw configured value.
                target = '_gateway' if attrib == 'gateway' else attrib
                setattr(self, target, config_file.get(section, attrib))

        for attrib in ['max_rtt', 'timeout', 'ping_interval']:
            if config_file.has_option(section, attrib):
                setattr(self, attrib, config_file.getfloat(section, attrib))

        for attrib in ['max_loss', 'ping_count', 'led', 'openvpn_master', 'fallback']:
            if config_file.has_option(section, attrib):
                setattr(self, attrib, config_file.getint(section, attrib))

    @property
    def device_up(self):
        """True/False from "ip link show" (state UP or LOWER_UP flag), None if unparsable."""
        (retcode, output) = run('ip link show dev {device}'.format(device=self.device))
        link = LINK.search(output)
        if not link:
            return None
        self._link_states = link.group('link_states').split(',')
        self._link_status = link.group('link_status')
        return (self._link_status == 'UP') or ('LOWER_UP' in self._link_states)

    def check_test_route(self):
        """Test if there is a route to target_ip through the gateway or interface, and add it if not present"""
        if not self.target_ip:
            return
        (retcode, route) = run('/sbin/ip route show {target_ip}'.format(target_ip=self.target_ip))
        gateway = self.gateway  # the property may run "ip route": read it once
        if gateway:
            if "{target_ip} via {gateway}".format(target_ip=self.target_ip, gateway=gateway) not in route:
                logger.debug(run('/sbin/ip route del {target_ip}'.format(target_ip=self.target_ip),
                                 dry_run=self.dry_run)[1])
                logger.warning('No route for {target_ip} via {gateway}, adding one'.format(
                    target_ip=self.target_ip, gateway=gateway))
                logger.debug(run('/sbin/ip route add {target_ip} via {gateway}'.format(
                    target_ip=self.target_ip, gateway=gateway), dry_run=self.dry_run)[1])
        elif self.device:
            if " {} ".format(self.device) not in route:
                logger.warning('No route for {target_ip} through {device}, adding one'.format(
                    target_ip=self.target_ip, device=self.device))
                logger.debug(run('/sbin/ip route add {target_ip} dev {device}'.format(
                    target_ip=self.target_ip, device=self.device), dry_run=self.dry_run)[1])
        else:
            logger.critical('No gateway for {target_ip}'.format(target_ip=self.target_ip))

    def check_gateway(self):
        """ARP-ping the gateway; return True/False, or None when there is no gateway."""
        gateway = self.gateway
        if gateway:
            result = arping(device=self.device, target_ip=gateway)
            self.gateway_mac = result['mac']
            self.gateway_rtt = result['rtt']
            self.gateway_alive = result['alive']
        else:
            self.gateway_mac = None
            self.gateway_rtt = None
            self.gateway_alive = None
        return self.gateway_alive

    def _update_from_ping(self, returncode, output):
        """Set last_loss, last_rtt, _available and status from one ping run."""
        report = REPORT.search(output)
        rtt = RTT.search(output)
        self.last_loss = int(report.group('loss')) if report else None
        self.last_rtt = float(rtt.group('avg')) if rtt else None

        loss_ok = self.last_loss is not None and self.last_loss <= self.max_loss
        rtt_ok = self.last_rtt is not None and self.last_rtt <= self.max_rtt
        self._available = (returncode == 0) and loss_ok and rtt_ok

        if self._available:
            self.status = 'OK'
        elif self.last_loss is not None and not loss_ok:
            self.status = 'Too much loss {}%'.format(self.last_loss)
        elif self.last_rtt is not None and not rtt_ok:
            self.status = 'Too long RTT {}ms'.format(self.last_rtt)
        else:
            self.status = 'ping test failed : {}'.format(output)

    def check_available(self):
        """ping the target and change available property based on max_rtt and max_loss

        available == True if actual rtt and loss are below the max_rtt and max_loss.
        A provider without target_ip is considered available as soon as its
        device is up; a device that is down (or unparsable) is not available.
        """
        self._available = None
        self.last_check_time = datetime.datetime.now()
        if self.device_up:
            self.check_local_ip()
            gateway = self.gateway
            if gateway and not self.check_gateway():
                self.status = 'Gateway {} not reachable'.format(gateway)
                logger.critical('Gateway {} not reachable'.format(gateway))
            if self.target_ip:
                self.check_test_route()
                (returncode, output) = run(
                    '/bin/ping -q -n -c{ping_count:n} -W{timeout:n} -i{ping_interval} -I{device} {target_ip}'.format(
                        ping_count=self.ping_count,
                        timeout=self.timeout,
                        device=self.device,
                        target_ip=self.target_ip,
                        ping_interval=self.ping_interval,
                        ))
                self._update_from_ping(returncode, output)
            else:
                self._available = True
        else:
            self.status = 'Device {} is down or link state is unknown'.format(self.device)
            self._available = False

        self.update_leds()
        return self._available

    def check_local_ip(self):
        """Get local ip of device, set ip, device_mac and device_type"""
        (retcode, output) = run('ip addr show dev {device}'.format(device=self.device))
        ipaddr = IPV4ADDR.search(output)
        self.last_ip = ipaddr.group('ipv4') if ipaddr else None
        macaddr = MACADDR.search(output)
        if macaddr:
            self.device_mac = macaddr.group('mac')
            self.device_type = macaddr.group('type')
        else:
            self.device_mac = None
            self.device_type = None
        return self.last_ip

    @property
    def gateway(self):
        """Next hop of the provider, or None.

        The configured value wins ('-' meaning no gateway, e.g. ppp);
        otherwise the default route of the routing table named after the
        provider is used (dhcp case).
        """
        if self._gateway:
            # ppp, shorewall notation for no gateway
            return None if self._gateway == '-' else self._gateway
        # from dhcp
        (retcode, output) = run('ip route list table {}'.format(self.provider_name))
        gw = GW.search(output)
        if gw:
            logger.debug('Gateway : {}'.format(gw.group('gateway')))
            return gw.group('gateway')
        logger.debug('No gateway')
        return None

    @gateway.setter
    def gateway(self, value):
        self._gateway = value

    # The setter used to be (mis)named gateway_set, which left "gateway"
    # read-only; the old name is kept as an alias of the fixed property.
    gateway_set = gateway

    @property
    def enabled(self):
        """True when the provider routing table has at least one route.

        The result is cached in last_enabled, which is also what is returned
        if the lookup itself raises.
        """
        try:
            (retcode, routes) = run('ip route list table {}'.format(self.provider_name))
            if retcode == 0:
                self.last_enabled = len(routes.splitlines()) > 0
            else:
                self.last_enabled = False
        except Exception as e:
            logger.critical("Unable to get enabled status from routing table: {}".format(e))
        return self.last_enabled

    def _write_led(self, *settings):
        """Write (attribute, value) pairs, in order, under the LED sysfs directory.

        Nothing is done when the directory does not exist (no LED configured).
        """
        led_path = '/sys/class/leds/alix:{}'.format(self.led)
        if os.path.isdir(led_path):
            for attribute, value in settings:
                with open(os.path.join(led_path, attribute), 'w') as f:
                    f.write(value)

    def led_off(self):
        self._write_led(('brightness', '0'), ('trigger', 'none'))

    def led_on(self):
        self._write_led(('trigger', 'none'), ('brightness', '1'))

    def led_blink(self):
        self._write_led(('brightness', '1'), ('trigger', 'timer'))

    def update_leds(self):
        """LED on: enabled and available; blinking: enabled, device up, not available; else off."""
        # /sys/class/leds/alix\:1/trigger
        # none backlight default-on [heartbeat] timer
        if self.enabled:
            if self._available:
                self.led_on()
            elif self.device_up:
                self.led_blink()
            else:
                self.led_off()
        else:
            self.led_off()

    def enable(self):
        """Enable the provider in Shorewall (and restart openvpn for the master provider)."""
        if not self.enabled:
            logger.debug('Enable {}'.format(self.provider_name))
            print(run('/sbin/shorewall enable {}'.format(self.provider_name), dry_run=self.dry_run))
            if self.openvpn_master:
                logger.info('Restarting openvpn')
                print(run('/etc/init.d/openvpn restart', dry_run=self.dry_run))
            # here check the connectivity.... else rollback
            self.update_leds()
        else:
            logger.debug('{} already enabled'.format(self.device))

    def disable(self):
        """Disable the provider in Shorewall and clean up what was bound to it."""
        if self.enabled:
            openvpn = self.used_by_openvpn()
            logger.debug('Disable {}'.format(self.provider_name))
            print(run('/sbin/shorewall disable {}'.format(self.provider_name), dry_run=self.dry_run))
            # remove connections
            if self.last_ip:
                logger.info('removing conntrack entries')
                # dry_run belongs to run(): it used to be passed to logger.debug
                logger.debug(run('/usr/sbin/conntrack -D -s {src}'.format(src=self.last_ip),
                                 dry_run=self.dry_run)[1])
            # be sure there is no default gw in main table so that fallback provider can be reached
            self.remove_default_gw()
            # restart openvpn if it was running on this provider
            if openvpn:
                logger.info('openvpn was running here, restarting openvpn')
                print(run('/etc/init.d/openvpn restart', dry_run=self.dry_run))
            self.update_leds()

    def remove_default_gw(self):
        """Remove default route which could have been added in main routing table and will prevent fallback interface from taking over"""
        (retcode, routes) = run('ip route list table main dev {}'.format(self.device))
        if retcode == 0 and 'default ' in routes:
            print(run('ip route del default table main dev {}'.format(self.device), dry_run=self.dry_run))

    def __str__(self):
        if self._available is None:
            available = "UNKNOWN"
        elif self._available:
            available = "AVAILABLE"
        else:
            available = "UNUSABLE"

        return ("Provider {provider} on {device} ip:{local_ip} nh:{gw} (testing IP:{target_ip}) "
                "loss:{loss}%,rtt:{rtt}ms {available} ({status})").format(
            available=available,
            provider=self.provider_name,
            device=self.device,
            target_ip=self.target_ip,
            loss=self.last_loss,
            rtt=self.last_rtt,
            local_ip=self.last_ip,
            gw=self.gateway or "-",
            status=self.status,
            )

    def as_dict(self):
        """Plain-data snapshot of the provider, used for the json output."""
        return dict(
            target_ip=self.target_ip,
            provider_name=self.provider_name,
            device=self.device,
            gateway=self._gateway,
            max_rtt=self.max_rtt,
            max_loss=self.max_loss,
            ping_count=self.ping_count,
            ping_interval=self.ping_interval,
            ping_timeout=self.timeout,
            last_rtt=self.last_rtt,
            last_loss=self.last_loss,
            available=self._available,
            link_states=self._link_states,
            link_status=self._link_status,
            led=self.led,
            status=self.status,
            last_check_time=self.last_check_time,
            last_ip=self.last_ip,
            device_mac=self.device_mac,
            device_type=self.device_type,
            gateway_alive=self.gateway_alive,
            gateway_mac=self.gateway_mac,
            gateway_rtt=self.gateway_rtt,
            enabled=self.last_enabled,
            )


def read_config(filename, providers):
    """Replace the content of the list providers with one Provider per section of filename."""
    cp = RawConfigParser()
    cp.read(filename)

    # Empty the caller's list in place (the original popped from an unbound name).
    del providers[:]

    for provider_name in cp.sections():
        provider = Provider(provider_name)
        provider.read_config(cp)
        providers.append(provider)


def select_providers(providers, names):
    """Return the providers named in names; no name, or 'all', selects every provider."""
    if not names or 'all' in names:
        return list(providers)
    return [provider for provider in providers if provider.provider_name in names]


def is_pid_running(pidfile):
    """return pid if pid in pidfile is a running process, remove pidfile if pid is no more running"""
    if not os.path.isfile(pidfile):
        return None
    with open(pidfile, 'r') as f:
        pid = f.read().strip()
    # isdigit() also rejects an empty or corrupted pidfile, treated as stale
    if pid.isdigit() and os.path.isdir("/proc/{}".format(pid)):
        return int(pid)
    os.unlink(pidfile)
    return None


def write_pidfile(pidfile, pid=None):
    """Write pid (default: current process) to pidfile; raise if another live process owns it."""
    if pid is None:
        pid = os.getpid()
    oldpid = is_pid_running(pidfile)
    if oldpid and oldpid != pid:
        raise Exception('There is already a running process {} for the pid file {}'.format(oldpid, pidfile))

    with open(pidfile, "w") as f:
        f.write(str(pid))


def remove_pidfile(pidfile):
    if os.path.isfile(pidfile):
        os.unlink(pidfile)


def monitor_pass(providers):
    """Check every provider once, then enable the available ones and disable the others.

    An unavailable provider is kept enabled when it is the fallback or when
    no provider at all is currently available.
    """
    logger.info('Checking providers {}:'.format(','.join([provider.provider_name for provider in providers])))
    current_ok = [provider for provider in providers if provider.check_available()]
    for provider in providers:
        if provider._available:
            if not provider.enabled:
                logger.warning("Enabling the available provider {}".format(provider.provider_name))
                provider.enable()
        elif provider.enabled:
            if current_ok and not provider.fallback:
                logger.critical("Disabling the provider {} because {}".format(provider.provider_name, provider.status))
                provider.disable()
            elif not current_ok:
                logger.critical("About to disable provider {} but will not because there are no other one".format(provider.provider_name))
            else:
                logger.critical("Not disabling fallback provider {}".format(provider.provider_name))
        logger.info(' {}'.format(provider))


def signal_monitor(pidfile, signum, description):
    """Send signum to the monitor recorded in pidfile; return its pid, or None if none runs."""
    monitor_pid = is_pid_running(pidfile)
    if monitor_pid:
        logger.info('Sending a {} signal to running monitor process {}'.format(description, monitor_pid))
        os.kill(monitor_pid, signum)
    return monitor_pid


def main(argv=None):
    """Command line entry point; returns the process exit status (None means 0)."""
    (options, args) = parser.parse_args(argv)

    if len(args) < 1:
        print("ERROR : You must provide one action to perform")
        parser.print_usage()
        return 2

    action = args[0]
    config_file = options.config
    monitor_pid_file = '/var/run/check-providers.pid'

    # setup Logger
    if options.logfile:
        hdlr = logging.FileHandler(filename=options.logfile, encoding='utf8')
        hdlr.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    else:
        hdlr = logging.StreamHandler()
    logger.addHandler(hdlr)
    # optparse already restricts loglevel to valid logging level names
    logger.setLevel(getattr(logging, options.loglevel.upper()))

    # Config file
    if not os.path.isfile(config_file):
        logger.error("Error : could not find file : " + config_file + ", please check the path")
    logger.debug("Using " + config_file + " config file")

    providers = []
    read_config(config_file, providers)

    if options.ping_count:
        for provider in providers:
            provider.ping_count = options.ping_count

    if options.dry_run:
        logger.warning('### DRY RUN ### no change to routing or interface state will be performed')
        for provider in providers:
            provider.dry_run = options.dry_run

    if action == 'stop':
        if not signal_monitor(monitor_pid_file, signal.SIGTERM, 'TERM'):
            logger.warning('No running monitoring found')
        return 0

    if action == 'trigger':
        if signal_monitor(monitor_pid_file, signal.SIGHUP, 'wakeup'):
            return 0
        logger.critical('No running monitoring found')
        return 1

    if action == 'monitor':
        # a monitor is already running: just wake it up
        if signal_monitor(monitor_pid_file, signal.SIGHUP, 'wakeup'):
            return 0
        try:
            write_pidfile(monitor_pid_file)

            def handler(signum, frame):
                logger.info('Wake up by signal {}'.format(signum))
                if signum == signal.SIGHUP:
                    logger.info(jsondumps(providers, indent=True))
                elif signum == signal.SIGTERM:
                    logger.info('Received kill, closing')
                    remove_pidfile(monitor_pid_file)
                    sys.exit(0)

            # Set the signal handler and a alarm
            signal.signal(signal.SIGALRM, handler)
            signal.signal(signal.SIGHUP, handler)
            signal.signal(signal.SIGTERM, handler)

            while True:
                try:
                    monitor_pass(providers)
                except Exception as e:
                    logger.critical(e, exc_info=True)
                # Wait outside the try block: a failing pass must not turn
                # the loop into a busy loop.
                signal.alarm(options.check_interval)
                signal.pause()
        finally:
            remove_pidfile(monitor_pid_file)

    elif action == 'check':
        for provider in select_providers(providers, args[1:]):
            print("Checking {}".format(provider.provider_name))
            provider.check_available()
            print(provider)

    elif action == 'check-json':
        result = []
        for provider in select_providers(providers, args[1:]):
            provider.check_available()
            result.append(provider.as_dict())
        print(jsondumps(result, indent=True))

    else:
        print("ERROR : unknown action {}".format(action))
        parser.print_usage()
        return 2


if __name__ == '__main__':
    sys.exit(main())

# ------------------------------------------------------------------------------
# Sample configuration (samples/check-providers.ini in the original patch):
#
#   [ADSL]
#   device=eth0
#   target_ip=185.16.48.54
#   gateway=192.168.149.254
#   led=2
#   openvpn_master=1
#
#   [GSM]
#   device=ppp3g
#   target_ip=185.16.48.55
#   max_loss=40
#   max_rtt=2000
#   ping_count=20
#   led=3
#   fallback=1
# ------------------------------------------------------------------------------