#! /bin/bash
# Copyright 2016, 2017, Cumulus Networks, Inc.  All rights reserved.

# This script is designed to run from a systemd service, which
# in turn is run from a systemd timer.  Therefore any state needs
# to reside in the filesystem.

# It implements system monitoring for filesystem, memory, and CPU use
# and reports issues to syslog.

svc=sysmonitor
svcdir="/run/${svc}"

# Built-in limits, used when the config file is missing or does not
# set a value.

# Debug switch.  It may be set in sysmonitor.conf although it is not
# documented there.  The script normally emits no debug messages, but
# they can help development and customer debugging.
declare -i debug=0

# root filesystem usage (df --output=pcent /)
declare -i diskalarm=90 diskcrit=95
# system memory usage: (memory used / total memory) * 100
declare -i memalarm=90 memcrit=95
# total cpu utilization over all cores, as a percentage
declare -i cpualarm=80 cpucrit=95
# (5 minute load avg / number of cores) * 100
declare -i loadalarm=95 loadcrit=125

if [ ! -d "${svcdir}" ]; then
    mkdir -p "${svcdir}"
fi

# Pick up the configured limits when the config file exists
if [ -f "/etc/cumulus/${svc}.conf" ]; then
    . "/etc/cumulus/${svc}.conf"
fi

# Bookkeeping carried over from the previous pass.  Initialized after the
# config file has been read and before the saved status is loaded, so
# customer errors in the config file don't affect correctness.
declare -i prevdiskalarm=0 prevdiskcrit=0
declare -i prevmemalarm=0 prevmemcrit=0
declare -i prevcpualarm=0 prevcpucrit=0
declare -i prevloadalarm=0 prevloadcrit=0

declare -i ncores=0

# for system cpu usage
declare -i prevuptime=0 previdletime=0


runstatus="${svcdir}/status"
historylog="${svcdir}/history"

# Restore the state saved by an earlier pass, if there is one
if [ -f "${runstatus}" ]; then
    . "${runstatus}"
fi

# Send a message to syslog, tagged with the service name.
#   $1    - syslog priority handed to "logger -p" (info, alert, crit, ...)
#   $2... - the message; when no message arguments are given, logger
#           takes the message lines from standard input instead
function log()
{
    local pri=$1; shift
    # Quoted so that the message is passed through exactly as given,
    # without another round of word splitting or filename expansion.
    logger -t "${svc}" -p "${pri}" -- "$@"
}

# Emit a debug message to syslog and to stderr (logger -s), but only
# when the global "debug" setting is greater than zero.
#   $@ - the message
# Returns success when debugging is switched off, so a silent call is
# never mistaken for a failure.
function debug()
{
    if [ "${debug}" -gt 0 ]; then
        # Quoted so the message is not re-split or glob-expanded
        logger -s -t "${svc}" -p debug -- "$@"
    fi
}

# Write the alarm bookkeeping, the core count and the cpu counters to
# ${runstatus} as name=value lines, the form that is sourced at startup.
# The data goes to a temporary file first and is renamed into place only
# if writing it succeeded.
function savestate()
{
    local name
    local -a names=(
        prevdiskalarm prevdiskcrit prevmemalarm prevmemcrit
        prevcpualarm prevcpucrit prevloadalarm prevloadcrit
        ncores prevuptime previdletime
    )

    for name in "${names[@]}"; do
        # ${!name} is the value of the variable whose name is in $name
        printf '%s=%s\n' "${name}" "${!name}"
    done > "${runstatus}.tmp" && mv "${runstatus}.tmp" "${runstatus}"
}

# Keep a running log of state for debugging problems.  Not in syslog,
# to avoid cluttering up the logs and to reduce disk writes.
# Timestamps have nanosecond resolution to correlate more closely with
# syslog, even though the information is gathered over a reasonable
# fraction of a second.
# Appends one line to ${historylog}: the timestamp followed by the
# globals memavail, rootfs, loadaverage and cpu_util.
function loghistory()
{
    local stamp

    # Only the timestamp goes through date's format string.  The data
    # values contain '%' characters, which date would otherwise treat
    # as the start of a conversion specifier.
    stamp=$(date +%FT%T.%N)
    printf '%s %s %s %s %s\n' "${stamp}" "${memavail}" "${rootfs}" \
        "${loadaverage}" "${cpu_util}" >> "${historylog}"
}

# Check the root filesystem (and only that one) for being overly full.
# Logs at crit or alert priority when usage reaches diskcrit or
# diskalarm, and a notice once usage is back below both.
# After a report the remembered level is raised above the current usage
# (2 points when critical, 5 when alarming), so the message is repeated
# only when usage climbs past that mark.
# Sets the global rootfs, which loghistory records.
function chkrootfs()
{
    local -i used

    # last line of the df output, with the '%' sign removed
    used=$(df --output=pcent / | sed -n '$s/%//p')
    rootfs="rootfs:${used}%" # for loghistory

    if (( used >= diskcrit )); then
        if (( used > prevdiskcrit )); then
            log crit / filesystem critically full: "${used}%" in use
        fi
        prevdiskcrit=$(( used + 2 ))
        prevdiskalarm=0
    elif (( used >= diskalarm )); then
        if (( used > prevdiskalarm )); then
            log alert / filesystem nearly full: "${used}%" in use
        fi
        prevdiskalarm=$(( used + 5 ))
        prevdiskcrit=0
    else
        # below both limits: report the recovery once, then forget
        if (( prevdiskcrit > 0 )); then
            log notice / filesystem no longer critically full: "${used}%" in use
        elif (( prevdiskalarm > 0 )); then
            log notice / filesystem no longer nearly full: "${used}%" in use
        fi
        prevdiskalarm=0
        prevdiskcrit=0
    fi
}

# Check system memory usage: (memory used / total memory) * 100, where
# "used" is MemTotal minus MemAvailable as read from /proc/meminfo.
# Logs at crit or alert priority when usage reaches memcrit or memalarm,
# and a notice once usage is back below both.  After a report the
# remembered level is raised above the current usage (2 points when
# critical, 5 when alarming) to avoid repeating the message.
# Sets the global memavail, which loghistory records.
# Returns 1 if a crit or alert message was logged on this pass, else 0.
function chkmem()
{
    local -i total_kb=0 avail_kb=0 num used_pct free_pct alerted=0
    local key suffix

    while read key num suffix; do
        if [[ ${key} == MemTotal* ]]; then
            total_kb=${num}
        elif [[ ${key} == MemAvailable* ]]; then
            avail_kb=${num}
        fi
        # stop reading as soon as both figures are known
        if (( total_kb > 0 && avail_kb > 0 )); then
            break
        fi
    done < /proc/meminfo

    (( used_pct = ((total_kb - avail_kb) * 100) / total_kb ))
    (( free_pct = 100 - used_pct ))
    memavail="memavail:${avail_kb}KB,${free_pct}%" # for loghistory

    if (( used_pct >= memcrit )); then
        if (( used_pct > prevmemcrit )); then
            log crit Critically low free memory: "${used_pct}%" in use
            alerted=1
        fi
        prevmemcrit=$(( used_pct + 2 ))
        prevmemalarm=0
    elif (( used_pct >= memalarm )); then
        if (( used_pct > prevmemalarm )); then
            log alert Low free memory: "${used_pct}%" in use
            alerted=1
        fi
        prevmemalarm=$(( used_pct + 5 ))
        prevmemcrit=0
    else
        # below both limits: report the recovery once, then forget
        if (( prevmemcrit > 0 )); then
            log notice Free memory no longer critically low: "${used_pct}%" in use
        elif (( prevmemalarm > 0 )); then
            log notice Free memory no longer low: "${used_pct}%" in use
        fi
        prevmemalarm=0
        prevmemcrit=0
    fi
    return ${alerted}
}

# Check the 5 minute load average, scaled per core:
# (5 minute load avg / number of cores) * 100 is compared with
# loadalarm and loadcrit.
#   $1 - optional file to read instead of /proc/loadavg (same format)
# Logs at crit or alert priority when a limit is reached and a notice
# once the value is back below both.  Messages show the per-core average
# followed, in parentheses, by the same value scaled back up by the
# number of cores.
# Sets the global loadaverage (for loghistory) and fills in the global
# ncores when it is still 0.
# Returns 1 if we went over one of the thresholds and logged it, else 0.
function chkload()
{
    local loadfile=${1:-/proc/loadavg}
    local -i min fiv fteen avg5 upavg5 ret=0
    # Fractional parts are not declared numeric because they can have a
    # leading 0, and we don't want read to get an error.
    local mind fivd fteend
    local rest avg upavg cores

    # we need to determine number of cores if we don't already have it
    if (( ncores == 0 )); then
        # Use only the line that starts with "CPU(s):".  lscpu can print
        # other lines containing that text (e.g. "NUMA node0 CPU(s):"),
        # which would make the result multi-line and non-numeric.
        cores=$(lscpu | awk '/^CPU\(s\):/ { print $2; exit }')
        if [[ ${cores} =~ ^[0-9]+$ ]]; then
            ncores=$(( 10#${cores} ))
        fi
    fi
    if (( ncores <= 0 )); then
        ncores=1 # have to assume something...
    fi

    # "1.23 0.45 0.67 ..." is split on both blanks and dots
    IFS=" ." read -r min mind fiv fivd fteen fteend rest < "${loadfile}"
    # for loghistory
    loadaverage="loadavg:1m:${min}.${mind},5m:${fiv}.${fivd},15m:${fteen}.${fteend}"

    # per-core 5 minute load in hundredths; 10# forces fivd to decimal
    (( avg5 = ((100 * fiv) + 10#${fivd}) / ncores ))
    (( upavg5 = ncores * avg5 )) # loadavg style, for clarity

    # render both values as N.NN, zero-padding the fraction
    printf -v avg '%d.%02d' $(( avg5 / 100 )) $(( avg5 % 100 ))
    printf -v upavg '%d.%02d' $(( upavg5 / 100 )) $(( upavg5 % 100 ))

    if (( avg5 >= loadcrit )); then
        if (( avg5 > prevloadcrit )); then
            log crit Critically high load average: "${avg}" "(${upavg})"
            ret=1
        fi
        prevloadcrit=$(( avg5 + 5 ))
        prevloadalarm=0
    elif (( avg5 >= loadalarm )); then
        if (( avg5 > prevloadalarm )); then
            log alert High load average: "${avg}" "(${upavg})"
            ret=1
        fi
        prevloadalarm=$(( avg5 + 10 ))
        prevloadcrit=0
    else
        # below both limits: report the recovery once, then forget
        if (( prevloadcrit > 0 )); then
            log notice Load Average no longer critically high: "${avg}" "(${upavg})"
        elif (( prevloadalarm > 0 )); then
            log notice Load Average no longer high: "${avg}" "(${upavg})"
        fi
        prevloadalarm=0
        prevloadcrit=0
    fi
    return ${ret}
}

declare -i lasttopcpu=0
# Gather the top cpu users over a 60 second period, and log those that
# have 1% or higher cpu utilization.  It's sensitive to the exact
# format of the top command output, and uses a special .toprc to choose the
# fields we want.
# Don't run if it's been less than 5 minutes since our last run.
# The top pipeline runs in the background; lasttopcpu records when it
# was started.
function topcpu_users()
{
    local -i now delta

    now=$(date +%s)
    (( delta = now - lasttopcpu ))
    if (( delta < 300 )); then
        return 0
    fi

    # set HOME to get special .toprc
    log info Top CPU processes
    # top prints two samples 60 seconds apart.  sed drops lines 1-6,
    # keeps line 7, and drops everything from line 8 through the next
    # line containing "PID" (presumably the first sample's summary,
    # column header and process list - confirm against the .toprc).
    # grep then removes rows whose cpu figure is 0.x; the leading
    # (^|[^0-9]) keeps it from matching inside 10.x, 20.x, ... which
    # would hide the busiest threads.
    HOME=/usr/share/cumulus-tools top -b -o'%CPU' -H -n2 -d 60 |
        sed -n -e '1,6d' -e '8,/PID/d' -e p |
        grep -E -v '(^|[^0-9])0\.[0-9]+  *[0-9]\.[0-9]' | log info &
    lasttopcpu=${now}
}

declare -i lasttopmem=0
# Get top 10 memory users
# Don't run if it's been less than 1 minute since our last run.
# Background to not delay other processing
# Logs, at info priority: a title line, the ps column header, and the
# ten processes with the largest resident set size.
function topmem_users()
{
    local -i now delta
    local header

    now=$(date +%s)
    (( delta = now - lasttopmem ))
    if (( delta < 60 )); then
        return 0
    fi

    lasttopmem=${now}

    {
        echo Top Memory processes
        ps -e -o pid,rss,vsz,args | cut -c1-80 | {
            # Pass the column header through untouched, then sort what
            # is left.  Reading it from the same stream needs no fifo
            # and no temporary file, and nothing upstream is cut short.
            IFS= read -r header && printf '%s\n' "${header}"
            # ':' is replaced by '.' in the process lines, as before
            # (NOTE(review): the reason is not evident from this file).
            # Sort by rss, largest first, and keep ten.
            sed -e 's/:/./g' | sort -r -n -k2,3 | head
        }
    } | log info &
}

# Check total cpu utilization over all cores, as a percentage, for the
# interval since the previous call.  It is derived from the uptime and
# idle-time counters in /proc/uptime; the idle counter is divided by the
# number of cores before it is compared with the elapsed time.
#   $1 - optional file to read instead of /proc/uptime (same format)
# Logs at crit or alert priority when usage reaches cpucrit or cpualarm,
# and a notice once usage is back below both.
# Sets the global cpu_util (for loghistory), updates prevuptime and
# previdletime, and fills in the global ncores when it is still 0.
# Returns 1 if we went over one of the thresholds and logged it, else 0.
function chkcpu()
{
    local uptimefile=${1:-/proc/uptime}
    local -i up idle thisup thisidle cpu ret=0
    # Fractional parts are not declared numeric because they can have a
    # leading 0, and we don't want read to get an error.
    local up_d idle_d
    local rest cores

    # we need to determine number of cores if we don't already have it
    if (( ncores == 0 )); then
        # Use only the line that starts with "CPU(s):".  lscpu can print
        # other lines containing that text (e.g. "NUMA node0 CPU(s):"),
        # which would make the result multi-line and non-numeric.
        cores=$(lscpu | awk '/^CPU\(s\):/ { print $2; exit }')
        if [[ ${cores} =~ ^[0-9]+$ ]]; then
            ncores=$(( 10#${cores} ))
        fi
    fi
    if (( ncores <= 0 )); then
        ncores=1 # have to assume something...
    fi

    # "12345.67 98765.43" is split on both blanks and dots
    IFS=" ." read -r up up_d idle idle_d rest < "${uptimefile}"

    # work in hundredths of a second; 10# forces the fraction to decimal
    (( up = (up * 100) + 10#${up_d} ))
    (( idle = (idle * 100) + 10#${idle_d} ))

    # If this is our first pass, we can't calculate cpu use, just init
    if (( prevuptime == 0 || previdletime == 0 )); then
        prevuptime=${up}
        previdletime=${idle}
        return ${ret}
    fi

    # turn the counters into deltas and remember the new baseline
    thisup=${up}
    thisidle=${idle}
    (( up -= prevuptime ))
    (( idle -= previdletime ))
    prevuptime=${thisup}
    previdletime=${thisidle}

    # No time has elapsed (or the counters went backwards): there is
    # nothing to measure, and dividing by the interval would fail.
    if (( up <= 0 )); then
        return ${ret}
    fi

    (( idle /= ncores ))
    (( cpu = (100 * (up - idle)) / up ))
    cpu_util="cpu_used:${cpu}%" # for loghistory

    if (( cpu >= cpucrit )); then
        if (( cpu > prevcpucrit )); then
            log crit Critically high CPU use: "${cpu}%"
            ret=1
        fi
        prevcpucrit=${cpu}
        prevcpualarm=0
    elif (( cpu >= cpualarm )); then
        if (( cpu > prevcpualarm )); then
            log alert High CPU use: "${cpu}%"
            ret=1
        fi
        prevcpualarm=${cpu}
        prevcpucrit=0
    else
        # below both limits: report the recovery once, then forget
        if (( prevcpucrit > 0 )); then
            log notice CPU use no longer critically high: "${cpu}%"
        elif (( prevcpualarm > 0 )); then
            log notice CPU use no longer high: "${cpu}%"
        fi
        prevcpualarm=0
        prevcpucrit=0
    fi
    return ${ret}
}



# main execution starts here

# Report the limits in effect (only shown when debugging is enabled)
debug Limits are diskcrit=${diskcrit}, diskalarm=${diskalarm}, \
  memalarm=${memalarm}, memcrit=${memcrit}, cpualarm=${cpualarm}, \
  cpucrit=${cpucrit}, loadalarm=${loadalarm}, loadcrit=${loadcrit}.

# Report the state restored from the previous run.  Every line but the
# last must end in a backslash; without it the rest runs as a separate
# command and is left out of the message.
debug Previous event values are prevdiskalarm=${prevdiskalarm}, \
    prevdiskcrit=${prevdiskcrit}, prevmemalarm=${prevmemalarm}, \
    prevmemcrit=${prevmemcrit}, prevcpualarm=${prevcpualarm}, \
    prevcpucrit=${prevcpucrit} prevloadalarm=${prevloadalarm}, \
    prevloadcrit=${prevloadcrit} prevuptime=${prevuptime}, \
    previdletime=${previdletime}


# Monitoring loop: one pass over all checks per minute, forever.
declare -i hiload hicpu himem
while true; do
    chkmem
    himem=$?

    chkrootfs

    chkload
    hiload=$?

    chkcpu
    hicpu=$?

    # a new load or cpu report also triggers the list of top cpu users
    if (( hiload > 0 || hicpu > 0 )); then
        topcpu_users
    fi

    # a new memory report also triggers the list of top memory users
    if (( himem > 0 )); then
        topmem_users
    fi

    loghistory
    savestate

    sleep 60
done
