#! /bin/bash

# Copyright 2016, 2019 Cumulus Networks, Inc. All rights reserved.

# This script is run from systemd via OnFailure actions for
# cumulus services for which we want to run cl-support

# This script should be started with %i as its argument, to get the name
# of the restarted service.

# At the time of this writing (Jan 2016), that's switchd and clagd

# We don't want to do anything if a service exits normally, such
# as on systemctl stop, or an exit status marked as OK in the unit file.
[ "$SERVICE_RESULT" = success ] && exit 0

service=$1 # unit name exactly as passed on the command line
prog=${1%.service} # base name, without .service at the end
# watchdog.prog so programs that do 'rm -f /run/prog.*' don't catch it
heartbeatmiss=/run/watchdog.${prog}
failure=/run/failure.${prog}

# capture the state once, as soon as possible.
# ActiveState is requested as well: the dispatch at the bottom of this
# script has an arm that matches on it, and that arm can never match
# unless the property is part of the captured string.
stat="$(systemctl show \
    -p Result,ExecMainCode,ExecMainStatus,ActiveState,UnitFileState "$service" \
    | tr '\n' ' ')"
# We don't want the ActiveState or UnitFileState fields in the log message.
# They are filtered out by name, so the result does not depend on the
# order in which systemctl prints the properties.
statmsg=
read -r -a statfields <<< "$stat"
for statfield in "${statfields[@]}"; do
    case "$statfield" in
        ActiveState=* | UnitFileState=*) ;;
        *) statmsg="${statmsg:+$statmsg }$statfield" ;;
    esac
done
unset statfields statfield
systemdlib=/lib/systemd/system

# Check to see if the service file (only in /lib/systemd/system) is newer
# than the status file.  If so, the package has been updated since the last
# instance of failure, so we want to treat it as a new problem.
# Globals:   systemdlib, prog (read)
# Arguments: $1 - path of the status file to compare against
# Returns:   0 if both files exist and the service file is newer, else 1
function chkservicenewer()
{
    # Build the file name from $prog, which already has any ".service"
    # suffix stripped, so that an argument of "foo.service" does not turn
    # into "foo.service.service".
    local unitfile="${systemdlib}/${prog}.service"

    # -nt is also true when only the first file exists, so the status file
    # has to be checked for existence explicitly.
    [[ -e "$unitfile" && -e "$1" && "$unitfile" -nt "$1" ]]
}

# Handle a heartbeat (watchdog) miss of $prog.
# Keeps a running miss count in $heartbeatmiss; cl-support is only taken
# on miss #1, later misses are just logged.  The count starts over when
# chkservicenewer reports that the service file is newer than the state
# file.
# Globals:   prog, heartbeatmiss (read)
# Arguments: none
# Returns:   status of the last logger / cl-support command run
function handle_watchdog()
{
    local -i miss=0
    local mods=system,network

    # The state file holds a single "miss=N" assignment written below;
    # sourcing it sets the local counter.
    if [[ -e "$heartbeatmiss" ]] && ! chkservicenewer "$heartbeatmiss"; then
        . "$heartbeatmiss"
    fi
    (( miss++ ))
    echo "miss=${miss}" > "$heartbeatmiss"

    # Extend the module list handed to cl-support for known programs.
    case "$prog" in
        clagd) mods+=,clag ;;
        frr | switchd) mods+=",$prog" ;;
    esac

    if (( miss != 1 )); then
        logger -p err -t heartbeat \
            "Restarting $prog after heartbeat miss #${miss} without cl-support"
    else
        logger -p err -t heartbeat \
            "$prog heartbeat miss #${miss} taking cl-support"
        /usr/cumulus/bin/cl-support -r "$prog first heartbeat miss" -e "$mods"
    fi
}

# Handle a non-watchdog failure of $prog.
# Keeps a running failure count in $failure; cl-support is only taken on
# failure #1, later failures are just logged together with $statmsg.  The
# count starts over when chkservicenewer reports that the service file is
# newer than the state file.
# Globals:   prog, failure, statmsg (read)
# Arguments: none used (callers pass the service name; it is ignored)
# Returns:   status of the last logger / cl-support command run
function log_failure()
{
    local -i fails=0

    # The state file holds a single "fails=N" assignment written below;
    # sourcing it sets the local counter.
    if [[ -e "$failure" ]] && ! chkservicenewer "$failure"; then
        . "$failure"
    fi
    (( fails++ ))
    echo "fails=${fails}" > "$failure"

    if (( fails != 1 )); then
        logger -p err -t Failure \
            "Not taking cl-support, $prog failure #${fails} status: $statmsg"
    else
        logger -p err -t Failure "$prog Failed, taking cl-support"
        /usr/cumulus/bin/cl-support -r "$prog first failure"
    fi
}


# Return 0 if the journal of the last 2 minutes contains a watchdog
# timeout message for the failing service, 1 otherwise.
# Globals: prog (read)
function saw_watchdog_timeout()
{
    # The pattern accepts the unit with or without its ".service" suffix
    # and both "<unit> watchdog timeout" and "<unit>: Watchdog timeout".
    # NOTE(review): the second form is what newer systemd releases appear
    # to log -- confirm against the deployed version.
    # Beware; if you have set -x in this script, the grep will always work...
    journalctl -l -o short --since='2 minutes ago' |
        grep -Eq "${prog}(\.service)?:? [Ww]atchdog.timeout"
}

# Decide what to do from the captured systemctl state.
# Arguments: $1 - space separated Property=value list (normally $stat)
function dispatch_on_result()
{
    case "$1" in
        *Result=watchdog*)
            handle_watchdog
            ;;
        *UnitFileState=disabled*)
            ;; # ignore disabled services
        *ActiveState=inactive*)
            # ignore inactive (stopped, never started) services; this only
            # matches when ActiveState is part of the captured state
            ;;
        *'Result=signal '* | *Result=exit-code*)
            # We need this case for the case when the program missing
            # the heartbeat was so stuck that systemd tried SIGTERM, but
            # the program still didn't exit, and therefore systemd used
            # SIGKILL.  When that happens, the Result is no longer
            # "watchdog".  We can see other statuses also, if systemd sends
            # the SIGTERM, and the process handles it and exits, or just dies.
            #
            # So we always look at the journalctl output for the last 2
            # minutes and if we see a watchdog reported for the failing
            # service, we assume it did watchdog, but then was killed with
            # SIGKILL.  We don't want to do this for all SIGKILL, in case
            # somebody does something like 'killall -KILL switchd'
            if saw_watchdog_timeout; then
                handle_watchdog
            else
                log_failure "$service"
            fi
            ;;
        *)
            log_failure "$service"
            ;;
    esac
}

dispatch_on_result "$stat"
