[IMP] Support sqlite db to store status and history

This commit is contained in:
2026-04-01 15:25:06 +02:00
parent 9c71e8d493
commit 425080dfec
+175 -133
View File
@@ -18,14 +18,12 @@ import logging
import re import re
import time import time
import datetime import datetime
import json import json
import signal import signal
import sqlite3
from iniparse import RawConfigParser from iniparse import RawConfigParser
from optparse import OptionParser from optparse import OptionParser
#from distutils.spawn import find_executable
usage="""\ usage="""\
%prog -c configfile action %prog -c configfile action
@@ -37,9 +35,10 @@ action is either :
monitor : monitor in background all providers and enable/disable them monitor : monitor in background all providers and enable/disable them
check [all,<provider>] : check all or one provider and display reachability check [all,<provider>] : check all or one provider and display reachability
check-json [all,<provider>] : check providers and output state as json data check-json [all,<provider>] : check providers and output state as json data
status : display current state from state file
""" """
version = "0.0.1" version = "0.0.2"
parser=OptionParser(usage=usage,version="%prog " + version) parser=OptionParser(usage=usage,version="%prog " + version)
parser.add_option("-i","--check-interval", dest="check_interval", type=int, default=60, help="Config file full path (default: %default)") parser.add_option("-i","--check-interval", dest="check_interval", type=int, default=60, help="Config file full path (default: %default)")
@@ -53,6 +52,66 @@ parser.add_option("-l","--loglevel", dest="loglevel", default='info', type='choi
REPORT = re.compile(r'\n(?P<transmitted>\d+)\s+packets transmitted,\s+(?P<received>\d+) received,\s+(?P<loss>\d+)%\s+packet loss') REPORT = re.compile(r'\n(?P<transmitted>\d+)\s+packets transmitted,\s+(?P<received>\d+) received,\s+(?P<loss>\d+)%\s+packet loss')
RTT = re.compile(r'rtt min/avg/max/mdev = (?P<min>[0-9.]+)/(?P<avg>[0-9.]+)/(?P<max>[0-9.]+)/(?P<mdev>[0-9.]+) ms') RTT = re.compile(r'rtt min/avg/max/mdev = (?P<min>[0-9.]+)/(?P<avg>[0-9.]+)/(?P<max>[0-9.]+)/(?P<mdev>[0-9.]+) ms')
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
BASE_DIR = '/opt/check_providers'
DB_PATH = os.path.join(BASE_DIR, 'check-providers.db')
STATE_FILE = os.path.join(BASE_DIR, 'check-providers-state.json')
MONITOR_PID_FILE = os.path.join(BASE_DIR, 'check-providers.pid')
# ---------------------------------------------------------------------------
# Database
# ---------------------------------------------------------------------------
def init_db():
"""Create the SQLite database and events table if not present."""
os.makedirs(BASE_DIR, exist_ok=True)
with sqlite3.connect(DB_PATH) as conn:
conn.execute('''
CREATE TABLE IF NOT EXISTS events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ts TEXT NOT NULL,
provider TEXT NOT NULL,
available INTEGER,
rtt REAL,
loss INTEGER,
status TEXT,
transition INTEGER DEFAULT 0
)
''')
conn.execute('CREATE INDEX IF NOT EXISTS idx_events_provider ON events(provider)')
conn.execute('CREATE INDEX IF NOT EXISTS idx_events_ts ON events(ts)')
conn.commit()
def purge_old_events(days=30):
"""Remove events older than `days` days."""
with sqlite3.connect(DB_PATH) as conn:
conn.execute(
"DELETE FROM events WHERE ts < datetime('now', '-{} days')".format(days)
)
conn.commit()
def write_state_file(providers):
"""Write the current state of all providers to the JSON state file."""
tmp = STATE_FILE + '.tmp'
with open(tmp, 'w') as f:
f.write(jsondumps([p.as_dict() for p in providers], indent=True))
os.replace(tmp, STATE_FILE) # atomic replace
def record_providers(providers):
"""Insert one row per provider into the events table."""
with sqlite3.connect(DB_PATH) as conn:
for provider in providers:
provider.record(conn)
conn.commit()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def run(cmd, dry_run=False): def run(cmd, dry_run=False):
try: try:
@@ -64,7 +123,6 @@ def run(cmd,dry_run=False):
else: else:
print("DRYRUN : {}".format(cmd)) print("DRYRUN : {}".format(cmd))
return (0, "#### DRYRUN ### no output for {}".format(cmd)) return (0, "#### DRYRUN ### no output for {}".format(cmd))
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
return (e.returncode, e.output) return (e.returncode, e.output)
@@ -79,50 +137,25 @@ def default_json(o):
else: else:
return u"{}".format(o) return u"{}".format(o)
def jsondumps(o, **kwargs): def jsondumps(o, **kwargs):
"""extended json dump of o""" """Extended json dump of o."""
return json.dumps(o, default=default_json, **kwargs) return json.dumps(o, default=default_json, **kwargs)
def arping(device,target_ip,ping_count=3):
# arping
#root@gw-pironniere:/opt/check-providers# arping -i eth0 -c4 192.168.149.254
"""\
ARPING 192.168.149.254
60 bytes from 00:0a:fa:24:18:f7 (192.168.149.254): index=0 time=229.674 usec
60 bytes from 00:0a:fa:24:18:f7 (192.168.149.254): index=1 time=258.173 usec
60 bytes from 00:0a:fa:24:18:f7 (192.168.149.254): index=2 time=251.468 usec
60 bytes from 00:0a:fa:24:18:f7 (192.168.149.254): index=3 time=252.305 usec
--- 192.168.149.254 statistics --- def arping(device, target_ip, ping_count=3):
4 packets transmitted, 4 packets received, 0% unanswered (0 extra)
"""
# iputils-arping
#root@gw-pironniere:/opt/check-providers# arping -c2 192.168.149.254
"""\
ARPING 192.168.149.254 from 192.168.149.184 eth0
Unicast reply from 192.168.149.254 [00:0A:FA:24:18:F7] 0.886ms
Unicast reply from 192.168.149.254 [00:0A:FA:24:18:F7] 0.777ms
Sent 2 probes (1 broadcast(s))
"""
ARPING1 = re.compile(r'bytes from (?P<mac>\S+).*time=(?P<rtt>[0-9.]*) (?P<unit>.*)') ARPING1 = re.compile(r'bytes from (?P<mac>\S+).*time=(?P<rtt>[0-9.]*) (?P<unit>.*)')
ARPING2 = re.compile(r'reply from.*\[(?P<mac>\S+)\]\s+(?P<rtt>[0-9.]*)(?P<unit>.*)') ARPING2 = re.compile(r'reply from.*\[(?P<mac>\S+)\]\s+(?P<rtt>[0-9.]*)(?P<unit>.*)')
# ARPING_PATH = find_executable('arping')
ARPING_PATH = "/usr/sbin/arping" ARPING_PATH = "/usr/sbin/arping"
if ARPING_PATH == None: if ARPING_PATH is None:
raise Exception('No arping command found') raise Exception('No arping command found')
elif "/usr/bin/arping" in ARPING_PATH: elif "/usr/bin/arping" in ARPING_PATH:
(returncode, output) = run('arping -c{ping_count} -I{device} {target_ip}'.format( (returncode, output) = run('arping -c{ping_count} -I{device} {target_ip}'.format(
ping_count = ping_count, ping_count=ping_count, device=device, target_ip=target_ip))
device = device,
target_ip = target_ip,
))
packets = [p.groupdict() for p in ARPING2.finditer(output.decode('utf-8'))] packets = [p.groupdict() for p in ARPING2.finditer(output.decode('utf-8'))]
elif "/usr/sbin/arping" in ARPING_PATH: elif "/usr/sbin/arping" in ARPING_PATH:
(returncode, output) = run('arping -c{ping_count} -i{device} {target_ip}'.format( (returncode, output) = run('arping -c{ping_count} -i{device} {target_ip}'.format(
ping_count = ping_count, ping_count=ping_count, device=device, target_ip=target_ip))
device = device,
target_ip = target_ip,
))
packets = [p.groupdict() for p in ARPING1.finditer(output.decode('utf-8'))] packets = [p.groupdict() for p in ARPING1.finditer(output.decode('utf-8'))]
result = {} result = {}
if packets: if packets:
@@ -135,15 +168,9 @@ Sent 2 probes (1 broadcast(s))
result['alive'] = False result['alive'] = False
return result return result
def openvpn_local_sockets(): def openvpn_local_sockets():
"""
Returns:
list of str of IP where openvpn is bound.
"""
(retcode, output) = run("/bin/netstat -lupnw | grep -E '(udp|tcp) .*/openvpn'") (retcode, output) = run("/bin/netstat -lupnw | grep -E '(udp|tcp) .*/openvpn'")
"""
udp 0 0 192.168.1.254:1194 0.0.0.0:* 16919/openvpn
"""
result = [] result = []
listening = output.splitlines() listening = output.splitlines()
for conn in listening: for conn in listening:
@@ -153,14 +180,15 @@ def openvpn_local_sockets():
result.append((proto, local_ip, local_port)) result.append((proto, local_ip, local_port))
return result return result
def delete_conntrack(conn): def delete_conntrack(conn):
"""Remove conntrack entries matching the OpenVPN listening processes"""
for (proto, ip, port) in conn: for (proto, ip, port) in conn:
if ip != '0.0.0.0': if ip != '0.0.0.0':
run('/usr/sbin/conntrack -D -p {proto} -s {src} --sport={port}'.format(src=ip, proto=proto, port=port)) run('/usr/sbin/conntrack -D -p {proto} -s {src} --sport={port}'.format(src=ip, proto=proto, port=port))
else: else:
run('/usr/sbin/conntrack -D -p {proto} --sport={port}'.format(src=ip, proto=proto, port=port)) run('/usr/sbin/conntrack -D -p {proto} --sport={port}'.format(src=ip, proto=proto, port=port))
def restart_openvpn(): def restart_openvpn():
conn = openvpn_local_sockets() conn = openvpn_local_sockets()
print(run('/etc/init.d/openvpn stop')) print(run('/etc/init.d/openvpn stop'))
@@ -169,10 +197,14 @@ def restart_openvpn():
print(run('/etc/init.d/openvpn start')) print(run('/etc/init.d/openvpn start'))
# ---------------------------------------------------------------------------
# Provider
# ---------------------------------------------------------------------------
class Provider(object): class Provider(object):
def __init__(self,provider_name,device=None,gateway=None,target_ip=None,max_rtt=2000.0,max_loss=30,ping_count=10,ping_interval=0.5,timeout=1.5,led=None): def __init__(self, provider_name, device=None, gateway=None, target_ip=None,
"""Parameters of an Internet provider as defined in Shorewall and availability limits max_rtt=2000.0, max_loss=30, ping_count=10, ping_interval=0.5,
""" timeout=1.5, led=None):
self.target_ip = target_ip self.target_ip = target_ip
self.provider_name = provider_name self.provider_name = provider_name
self.device = device self.device = device
@@ -199,6 +231,8 @@ class Provider(object):
self.last_loss = None self.last_loss = None
self._available = None self._available = None
self._previous_available = None # for transition detection
self._state_since = None # datetime of last state change
self._link_states = [] self._link_states = []
self._link_status = 'UNKNOWN' self._link_status = 'UNKNOWN'
@@ -209,12 +243,35 @@ class Provider(object):
self.dry_run = False self.dry_run = False
def record(self, conn):
"""Insert current state into the events table. Marks up<->down transitions."""
transition = int(
self._previous_available != self._available
and self._previous_available is not None
)
if transition:
self._state_since = datetime.datetime.now()
logger.info('Transition detected for {}: {} -> {}'.format(
self.provider_name, self._previous_available, self._available))
conn.execute(
'''INSERT INTO events (ts, provider, available, rtt, loss, status, transition)
VALUES (?, ?, ?, ?, ?, ?, ?)''',
(
datetime.datetime.now().isoformat(),
self.provider_name,
int(self._available) if self._available is not None else None,
self.last_rtt,
self.last_loss,
self.status,
transition,
)
)
self._previous_available = self._available
def used_by_openvpn(self, proto='udp', port=1194): def used_by_openvpn(self, proto='udp', port=1194):
(retcode,output) = run('conntrack -L -p {proto} --dport {port} -o extended | grep "={src}"'.format(proto=proto,src=self.last_ip,port=port)) (retcode, output) = run('conntrack -L -p {proto} --dport {port} -o extended | grep "={src}"'.format(
""" proto=proto, src=self.last_ip, port=port))
conntrack v1.2.1 (conntrack-tools): 1 flow entries have been shown.
ipv4 2 udp 17 178 src=192.168.149.184 dst=80.13.55.10 sport=1194 dport=1194 src=80.13.55.10 dst=192.168.149.184 sport=1194 dport=1194 [ASSURED] mark=1 use=1
"""
conn = output.splitlines() conn = output.splitlines()
for c in conn: for c in conn:
if "={src} ".format(src=self.last_ip) in c.decode('utf-8'): if "={src} ".format(src=self.last_ip) in c.decode('utf-8'):
@@ -240,14 +297,6 @@ ipv4 2 udp 17 178 src=192.168.149.184 dst=80.13.55.10 sport=1194 dport=1194 src=
@property @property
def device_up(self): def device_up(self):
(retcode, output) = run('ip link show dev {device}'.format(device=self.device)) (retcode, output) = run('ip link show dev {device}'.format(device=self.device))
"""
4: eth2: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast master br1 state DOWN mode DEFAULT qlen 1000
3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN mode DEFAULT qlen 1000
4: eth2: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast master br1 state DOWN mode DEFAULT qlen 1000
10: ppp3g: <POINTOPOINT,MULTICAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UNKNOWN mode DEFAULT qlen 3
6: br1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT
"""
LINK = re.compile(r':\s+<(?P<link_states>.+)>.* state (?P<link_status>.+?)\s') LINK = re.compile(r':\s+<(?P<link_states>.+)>.* state (?P<link_status>.+?)\s')
link = LINK.search(output.decode('utf-8')) link = LINK.search(output.decode('utf-8'))
if link: if link:
@@ -258,18 +307,21 @@ ipv4 2 udp 17 178 src=192.168.149.184 dst=80.13.55.10 sport=1194 dport=1194 src=
return None return None
def check_test_route(self): def check_test_route(self):
"""Test if there is a route to target_ip through the gateway or interface, and add it if not present"""
if self.target_ip: if self.target_ip:
(retcode, route) = run('/sbin/ip route show {target_ip}'.format(target_ip=self.target_ip)) (retcode, route) = run('/sbin/ip route show {target_ip}'.format(target_ip=self.target_ip))
if self.gateway: if self.gateway:
if not "{target_ip} via {gateway}".format(target_ip=self.target_ip, gateway=self.gateway) in route.decode('utf-8'): if not "{target_ip} via {gateway}".format(target_ip=self.target_ip, gateway=self.gateway) in route.decode('utf-8'):
logger.debug(run('/sbin/ip route del {target_ip}'.format(target_ip=self.target_ip), dry_run=self.dry_run)[1]) logger.debug(run('/sbin/ip route del {target_ip}'.format(target_ip=self.target_ip), dry_run=self.dry_run)[1])
logger.warning('No route for {target_ip} via {gateway}, adding one'.format(target_ip=self.target_ip,gateway=self.gateway)) logger.warning('No route for {target_ip} via {gateway}, adding one'.format(
logger.debug(run('/sbin/ip route add {target_ip} via {gateway}'.format(target_ip=self.target_ip,gateway=self.gateway),dry_run=self.dry_run)[1]) target_ip=self.target_ip, gateway=self.gateway))
logger.debug(run('/sbin/ip route add {target_ip} via {gateway}'.format(
target_ip=self.target_ip, gateway=self.gateway), dry_run=self.dry_run)[1])
elif self.device: elif self.device:
if not " {} ".format(self.device) in route.decode('utf-8'): if not " {} ".format(self.device) in route.decode('utf-8'):
logger.warning('No route for {target_ip} through {device}, adding one'.format(target_ip=self.target_ip,device=self.device)) logger.warning('No route for {target_ip} through {device}, adding one'.format(
logger.debug(run('/sbin/ip route add {target_ip} dev {device}'.format(target_ip=self.target_ip,device=self.device),dry_run=self.dry_run)[1]) target_ip=self.target_ip, device=self.device))
logger.debug(run('/sbin/ip route add {target_ip} dev {device}'.format(
target_ip=self.target_ip, device=self.device), dry_run=self.dry_run)[1])
else: else:
logger.critical('No gateway for {target_ip}'.format(target_ip=self.target_ip)) logger.critical('No gateway for {target_ip}'.format(target_ip=self.target_ip))
@@ -286,9 +338,6 @@ ipv4 2 udp 17 178 src=192.168.149.184 dst=80.13.55.10 sport=1194 dport=1194 src=
return self.gateway_alive return self.gateway_alive
def check_available(self): def check_available(self):
"""ping the target and change available property based on max_rtt and max_loss
available == True if actual rtt and loss are below the max_rtt and max_loss
"""
self._available = None self._available = None
self.last_check_time = datetime.datetime.now() self.last_check_time = datetime.datetime.now()
if self.device_up: if self.device_up:
@@ -339,7 +388,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
return self._available return self._available
def check_local_ip(self): def check_local_ip(self):
"""Get local ip of device, set ip, device_mac and device_type"""
(retcode, output) = run('ip addr show dev {device}'.format(device=self.device)) (retcode, output) = run('ip addr show dev {device}'.format(device=self.device))
IPV4ADDR = re.compile(r'\sinet\s+(?P<ipv4>\d+.\d+.\d+.\d+)[/\s]') IPV4ADDR = re.compile(r'\sinet\s+(?P<ipv4>\d+.\d+.\d+.\d+)[/\s]')
MACADDR = re.compile(r'link/(?P<type>\S+)(\s(?P<mac>\S+))?') MACADDR = re.compile(r'link/(?P<type>\S+)(\s(?P<mac>\S+))?')
@@ -360,18 +408,12 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
@property @property
def gateway(self): def gateway(self):
if self._gateway: if self._gateway:
#ppp, shorewall notation for no gateway
if self._gateway == '-': if self._gateway == '-':
return None return None
else: else:
return self._gateway return self._gateway
else: else:
#from dhcp
(retcode, output) = run('ip route list table {}'.format(self.provider_name)) (retcode, output) = run('ip route list table {}'.format(self.provider_name))
"""root@htouv:~# ip route list dev eth1
88.163.76.0/24 proto kernel scope link src 88.163.76.120
88.163.76.254 scope link src 88.163.76.120
"""
GW = re.compile(r'default via (?P<gateway>\d+.\d+.\d+.\d+)\s+') GW = re.compile(r'default via (?P<gateway>\d+.\d+.\d+.\d+)\s+')
gw = GW.search(str(output)) gw = GW.search(str(output))
if gw: if gw:
@@ -400,8 +442,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
return self.last_enabled return self.last_enabled
def led_off(self): def led_off(self):
# led_path = '/sys/class/leds/alix:{}'.format(self.led)
print('led off')
led_path = r'/sys/class/leds/apu:green:{}'.format(self.led) led_path = r'/sys/class/leds/apu:green:{}'.format(self.led)
if os.path.isdir(led_path): if os.path.isdir(led_path):
with open(os.path.join(led_path, 'brightness'), 'wb') as f: with open(os.path.join(led_path, 'brightness'), 'wb') as f:
@@ -410,7 +450,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
f.write(bytes('none', encoding='utf-8')) f.write(bytes('none', encoding='utf-8'))
def led_on(self): def led_on(self):
# led_path = '/sys/class/leds/alix:{}'.format(self.led)
led_path = r'/sys/class/leds/apu:green:{}'.format(self.led) led_path = r'/sys/class/leds/apu:green:{}'.format(self.led)
if os.path.isdir(led_path): if os.path.isdir(led_path):
with open(os.path.join(led_path, 'trigger'), 'wb') as f: with open(os.path.join(led_path, 'trigger'), 'wb') as f:
@@ -419,8 +458,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
f.write(bytes('1', encoding='utf-8')) f.write(bytes('1', encoding='utf-8'))
def led_blink(self): def led_blink(self):
# led_path = '/sys/class/leds/alix:{}'.format(self.led)
print('led blink')
led_path = r'/sys/class/leds/apu:green:{}'.format(self.led) led_path = r'/sys/class/leds/apu:green:{}'.format(self.led)
if os.path.isdir(led_path): if os.path.isdir(led_path):
with open(os.path.join(led_path, 'brightness'), 'wb') as f: with open(os.path.join(led_path, 'brightness'), 'wb') as f:
@@ -429,14 +466,9 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
f.write(bytes('heartbeat', encoding='utf-8')) f.write(bytes('heartbeat', encoding='utf-8'))
def update_leds(self): def update_leds(self):
""""""
# /sys/class/leds/alix\:1/trigger
#none backlight default-on [heartbeat] timer
if self.enabled: if self.enabled:
if self._available: if self._available:
self.led_on() self.led_on()
elif self.device_up:
self.led_off()
else: else:
self.led_off() self.led_off()
else: else:
@@ -450,7 +482,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
except Exception as e: except Exception as e:
logger.info('Retrying to disable/enable provider because %s' % e) logger.info('Retrying to disable/enable provider because %s' % e)
print(run('/var/lib/shorewall/firewall restart', dry_run=self.dry_run)) print(run('/var/lib/shorewall/firewall restart', dry_run=self.dry_run))
# here check the connectivity.... else rollback
self.update_leds() self.update_leds()
print('Routes after enabling provider %s\n%s' % (self.provider_name, run('/sbin/shorewall show routing'))) print('Routes after enabling provider %s\n%s' % (self.provider_name, run('/sbin/shorewall show routing')))
else: else:
@@ -460,19 +491,15 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
if self.enabled: if self.enabled:
openvpn = self.used_by_openvpn() openvpn = self.used_by_openvpn()
logger.debug('Disable {}'.format(self.provider_name)) logger.debug('Disable {}'.format(self.provider_name))
# restart openvpn if it was running on this provider
if openvpn: if openvpn:
logger.info('openvpn was running here, stopping openvpn') logger.info('openvpn was running here, stopping openvpn')
print(run('/etc/init.d/openvpn stop', dry_run=self.dry_run)) print(run('/etc/init.d/openvpn stop', dry_run=self.dry_run))
print(run('/var/lib/shorewall/firewall disable {}'.format(self.provider_name), dry_run=self.dry_run)) print(run('/var/lib/shorewall/firewall disable {}'.format(self.provider_name), dry_run=self.dry_run))
# remove connections
if self.last_ip: if self.last_ip:
logger.info('removing conntrack entries') logger.info('removing conntrack entries')
logger.info(run('/usr/sbin/conntrack -D -s {src}'.format(src=self.last_ip), dry_run=self.dry_run)[1]) logger.info(run('/usr/sbin/conntrack -D -s {src}'.format(src=self.last_ip), dry_run=self.dry_run)[1])
logger.info(run('/usr/sbin/conntrack -D -q {src}'.format(src=self.last_ip), dry_run=self.dry_run)[1]) logger.info(run('/usr/sbin/conntrack -D -q {src}'.format(src=self.last_ip), dry_run=self.dry_run)[1])
# be sure there is no default gw in main table so that fallback provider can be reached
self.remove_default_gw() self.remove_default_gw()
# restart openvpn if it was running on this provider
if openvpn: if openvpn:
logger.info('openvpn was running here, restarting openvpn') logger.info('openvpn was running here, restarting openvpn')
print(run('/etc/init.d/openvpn start', dry_run=self.dry_run)) print(run('/etc/init.d/openvpn start', dry_run=self.dry_run))
@@ -480,7 +507,6 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
print('Routes after provider %s disabling\n%s' % (self.provider_name, run('/sbin/shorewall show routing'))) print('Routes after provider %s disabling\n%s' % (self.provider_name, run('/sbin/shorewall show routing')))
def remove_default_gw(self): def remove_default_gw(self):
"""Remove default route which could have been added in main routing table and will prevent fallback interface from taking over"""
(retcode, routes) = run('ip route list table main dev {}'.format(self.device)) (retcode, routes) = run('ip route list table main dev {}'.format(self.device))
if retcode == 0: if retcode == 0:
if 'default ' in str(routes): if 'default ' in str(routes):
@@ -533,26 +559,30 @@ available == True if actual rtt and loss are below the max_rtt and max_loss
gateway_mac=self.gateway_mac, gateway_mac=self.gateway_mac,
gateway_rtt=self.gateway_rtt, gateway_rtt=self.gateway_rtt,
enabled=self.last_enabled, enabled=self.last_enabled,
state_since=self._state_since,
) )
# ---------------------------------------------------------------------------
# Config / pid helpers
# ---------------------------------------------------------------------------
def read_config(filename, providers): def read_config(filename, providers):
cp = RawConfigParser() cp = RawConfigParser()
cp.read(filename) cp.read(filename)
while providers: while providers:
providers.pop() providers.pop()
for provider_name in cp.sections(): for provider_name in cp.sections():
provider = Provider(provider_name) provider = Provider(provider_name)
provider.read_config(cp) provider.read_config(cp)
providers.append(provider) providers.append(provider)
def is_pid_running(pidfile): def is_pid_running(pidfile):
"""return pid if pid in pidfile is a running process, remove pidfile if pid is no more running"""
if os.path.isfile(pidfile): if os.path.isfile(pidfile):
with open(pidfile, 'rb') as f: with open(pidfile, 'rb') as f:
pid = f.read().strip() pid = f.read().strip()
if pid and os.path.isdir("/proc/{}".format(pid)): if pid and os.path.isdir("/proc/{}".format(pid.decode())):
return int(pid) return int(pid)
else: else:
os.unlink(pidfile) os.unlink(pidfile)
@@ -565,19 +595,22 @@ def write_pidfile(pidfile,pid=None):
if pid is None: if pid is None:
pid = os.getpid() pid = os.getpid()
oldpid = is_pid_running(pidfile) oldpid = is_pid_running(pidfile)
if oldpid: if oldpid and oldpid != pid:
# if oldpid <> pid:
if oldpid != pid:
raise Exception('There is already a running process {} for the pid file {}'.format(oldpid, pidfile)) raise Exception('There is already a running process {} for the pid file {}'.format(oldpid, pidfile))
os.makedirs(os.path.dirname(pidfile), exist_ok=True)
with open(pidfile, "wb") as f: with open(pidfile, "wb") as f:
print(pid) f.write(bytes(str(pid), 'utf-8'))
f.write(bytes(pid))
def remove_pidfile(pidfile): def remove_pidfile(pidfile):
if os.path.isfile(pidfile): if os.path.isfile(pidfile):
os.unlink(pidfile) os.unlink(pidfile)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
if __name__ == '__main__': if __name__ == '__main__':
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
@@ -592,7 +625,7 @@ if __name__ == '__main__':
verbose = options.verbose verbose = options.verbose
loglevel = options.loglevel loglevel = options.loglevel
monitor_pid_file = '/var/run/check-providers.pid' monitor_pid_file = MONITOR_PID_FILE
current_pid = os.getpid() current_pid = os.getpid()
# setup Logger # setup Logger
@@ -605,21 +638,16 @@ if __name__ == '__main__':
hdlr = logging.StreamHandler() hdlr = logging.StreamHandler()
logger.addHandler(hdlr) logger.addHandler(hdlr)
# set loglevel
if loglevel in ('debug', 'warning', 'info', 'error', 'critical'): if loglevel in ('debug', 'warning', 'info', 'error', 'critical'):
numeric_level = getattr(logging, loglevel.upper(), None) numeric_level = getattr(logging, loglevel.upper(), None)
if not isinstance(numeric_level, int): if not isinstance(numeric_level, int):
raise ValueError('Invalid log level: %s' % loglevel) raise ValueError('Invalid log level: %s' % loglevel)
logger.setLevel(numeric_level) logger.setLevel(numeric_level)
# Config file
if not os.path.isfile(config_file): if not os.path.isfile(config_file):
logger.error("Error : could not find file : " + config_file + ", please check the path") logger.error("Error : could not find file : " + config_file + ", please check the path")
logger.debug("Using " + config_file + " config file") logger.debug("Using " + config_file + " config file")
#adsl = Provider('ADSL',device='eth0',target_ip='185.16.48.54',gateway='192.168.149.254')
#gsm = Provider('GSM',device='ppp3g',target_ip='185.16.48.55',max_loss=40,max_rtt=2000,ping_count=20)
providers = [] providers = []
read_config(config_file, providers) read_config(config_file, providers)
@@ -632,10 +660,13 @@ if __name__ == '__main__':
for provider in providers: for provider in providers:
provider.dry_run = options.dry_run provider.dry_run = options.dry_run
# -----------------------------------------------------------------------
# Actions
# -----------------------------------------------------------------------
if action == 'stop': if action == 'stop':
monitor_pid = is_pid_running(monitor_pid_file) monitor_pid = is_pid_running(monitor_pid_file)
if monitor_pid: if monitor_pid:
# wakeup current monitor...
logger.info('Sending a TERM signal to running monitor process {}'.format(monitor_pid)) logger.info('Sending a TERM signal to running monitor process {}'.format(monitor_pid))
os.kill(monitor_pid, signal.SIGTERM) os.kill(monitor_pid, signal.SIGTERM)
sys.exit(0) sys.exit(0)
@@ -643,10 +674,9 @@ if __name__ == '__main__':
logger.warning('No running monitoring found') logger.warning('No running monitoring found')
sys.exit(0) sys.exit(0)
if action == 'trigger': elif action == 'trigger':
monitor_pid = is_pid_running(monitor_pid_file) monitor_pid = is_pid_running(monitor_pid_file)
if monitor_pid: if monitor_pid:
# wakeup current monitor...
logger.info('Sending a wakeup signal to running monitor process {}'.format(monitor_pid)) logger.info('Sending a wakeup signal to running monitor process {}'.format(monitor_pid))
os.kill(monitor_pid, signal.SIGHUP) os.kill(monitor_pid, signal.SIGHUP)
sys.exit(0) sys.exit(0)
@@ -654,41 +684,54 @@ if __name__ == '__main__':
logger.critical('No running monitoring found') logger.critical('No running monitoring found')
sys.exit(1) sys.exit(1)
if action == 'monitor': elif action == 'status':
try:
with open(STATE_FILE) as f:
print(f.read())
except FileNotFoundError:
print(jsondumps({'error': 'No state file found, is monitor running?'}))
sys.exit(0)
elif action == 'monitor':
monitor_pid = is_pid_running(monitor_pid_file) monitor_pid = is_pid_running(monitor_pid_file)
if monitor_pid: if monitor_pid:
# wakeup current monitor...
logger.info('Sending a wakeup signal to running monitor process {}'.format(monitor_pid)) logger.info('Sending a wakeup signal to running monitor process {}'.format(monitor_pid))
os.kill(monitor_pid, signal.SIGHUP) os.kill(monitor_pid, signal.SIGHUP)
sys.exit(0) sys.exit(0)
else: else:
init_db()
try: try:
write_pidfile(monitor_pid_file) write_pidfile(monitor_pid_file)
cycle_count = 0
def handler(signum, frame): def handler(signum, frame):
global providers global providers
logger.info('Wake up by signal {}'.format(signum)) logger.info('Wake up by signal {}'.format(signum))
if signum == signal.SIGHUP: if signum == signal.SIGHUP:
logger.info(jsondumps(providers, indent=True)) logger.info(jsondumps(providers, indent=True))
elif signum == signal.SIGUSR1:
write_state_file(providers)
logger.info('State file updated on SIGUSR1')
elif signum == signal.SIGTERM: elif signum == signal.SIGTERM:
logger.info('Received kill, closing') logger.info('Received kill, closing')
remove_pidfile(monitor_pid_file) remove_pidfile(monitor_pid_file)
sys.exit(0) sys.exit(0)
# Set the signal handler and a alarm
signal.signal(signal.SIGALRM, handler) signal.signal(signal.SIGALRM, handler)
signal.signal(signal.SIGHUP, handler) signal.signal(signal.SIGHUP, handler)
signal.signal(signal.SIGTERM, handler) signal.signal(signal.SIGTERM, handler)
signal.signal(signal.SIGUSR1, handler)
while True: while True:
try: try:
logger.info('Checking providers {}:'.format(','.join([provider.provider_name for provider in providers]))) cycle_count += 1
logger.info('Checking providers {}:'.format(
','.join([provider.provider_name for provider in providers])))
current_ok = [provider for provider in providers if provider.check_available()] current_ok = [provider for provider in providers if provider.check_available()]
# list of providers which are used by openvpn
openvpn_prov = [provider for provider in providers if provider.used_by_openvpn()] openvpn_prov = [provider for provider in providers if provider.used_by_openvpn()]
shorewall_restart_needed = False shorewall_restart_needed = False
for provider in providers: for provider in providers:
# we will check if a workable provider needs to be enabled by shorewall
if provider._available: if provider._available:
if not provider.enabled: if not provider.enabled:
logger.warning("Enabling the available provider {}".format(provider.provider_name)) logger.warning("Enabling the available provider {}".format(provider.provider_name))
@@ -696,18 +739,8 @@ if __name__ == '__main__':
run('/usr/sbin/conntrack -F') run('/usr/sbin/conntrack -F')
if provider.openvpn_master: if provider.openvpn_master:
restart_openvpn() restart_openvpn()
# todo : check balance routing table. If an interface involved in default route is removed (ppp or tun)
# the entire default route entry is removed by the kernel.
# so if we can't find a route which refer to it in balance table, trigger a restart of shorewall to cleanup the situation...
if not shorewall_restart_needed and not provider.fallback: if not shorewall_restart_needed and not provider.fallback:
(retcode, output) = run('ip route show table balance') (retcode, output) = run('ip route show table balance')
"""
default
nexthop via 185.16.51.9 realm 3 dev eth1 weight 1
nexthop dev tun2 weight 1
"""
balance = str(output).splitlines() balance = str(output).splitlines()
in_balance = False in_balance = False
for l in balance: for l in balance:
@@ -716,30 +749,37 @@ if __name__ == '__main__':
break break
if not in_balance: if not in_balance:
shorewall_restart_needed = True shorewall_restart_needed = True
logger.critical("Shorewall restart needed because provider {} is not in default balance route ".format(provider.provider_name)) logger.critical("Shorewall restart needed because provider {} is not in default balance route".format(
provider.provider_name))
run('/usr/sbin/shorewall restart && /usr/sbin/conntrack -F') run('/usr/sbin/shorewall restart && /usr/sbin/conntrack -F')
else: else:
if provider.enabled: if provider.enabled:
if current_ok and not provider.fallback: if current_ok and not provider.fallback:
logger.critical("Disabling the provider {} because {}".format(provider.provider_name,provider.status)) logger.critical("Disabling the provider {} because {}".format(
provider.provider_name, provider.status))
provider.disable() provider.disable()
else: else:
if not current_ok: if not current_ok:
logger.critical("About to disable provider {} but will not because there are no other one".format(provider.provider_name)) logger.critical("About to disable provider {} but will not because there are no other one".format(
provider.provider_name))
else: else:
logger.critical("Not disabling fallback provider {}".format(provider.provider_name)) logger.critical("Not disabling fallback provider {}".format(provider.provider_name))
logger.info(' {}'.format(provider)) logger.info(' {}'.format(provider))
# Persist state and history
write_state_file(providers)
record_providers(providers)
# Purge old events once every 100 cycles (~every 100 min with default interval)
if cycle_count % 100 == 0:
purge_old_events(days=30)
signal.alarm(options.check_interval) signal.alarm(options.check_interval)
signal.pause() signal.pause()
#time.sleep(options.check_interval)
except Exception as e: except Exception as e:
logger.critical(e) logger.critical(e)
#raise
finally: finally:
remove_pidfile(monitor_pid_file) remove_pidfile(monitor_pid_file)
@@ -754,6 +794,7 @@ if __name__ == '__main__':
print(provider) print(provider)
if provider.used_by_openvpn(): if provider.used_by_openvpn():
print("This provider is used by Openvpn") print("This provider is used by Openvpn")
elif action == 'check-json': elif action == 'check-json':
result = [] result = []
if len(args) >= 2: if len(args) >= 2:
@@ -764,3 +805,4 @@ if __name__ == '__main__':
provider.check_available() provider.check_available()
result.append(provider.as_dict()) result.append(provider.as_dict())
print(jsondumps(result, indent=True)) print(jsondumps(result, indent=True))