#! /bin/bash
# Copyright 2017,2018,2019,2020 Cumulus Networks, Inc.  All rights reserved.
#
# Collect platform hardware info and state, as well as basic
# system state (logs, systemd info, etc.); used by cl-support
#
# Note that cl-support expects the below three flags to be commented
#TIMEOUT=150
#DEFAULT
#ONCORE=all

# Name this module after the script's basename; used in log/progress text.
module="${0##*/}"

# Sub-modules run by default (in this order), followed by the ones that
# only run when explicitly requested.  Each name maps to a system_<name>
# function below.
funcs=(
    versions logs systemd dmesg hwinfo memory_use
    configs pkg misc uefi time
)
nondef_funcs=(
    pkgverify
)

# Both stay empty unless -j is given on the command line.
json=""
jexec=""

system_versions()
{
	# Collect OS, bootloader, kernel-config and firmware version data.
	# Globals:  SUP_TOPDIR (read) - root of the archive staging tree
	# Uses exec_cmd (defined outside this file) to run and record commands.

	# General system version information
	[ -x /usr/bin/lsb_release ] &&
		exec_cmd lsb_release /usr/bin/lsb_release -a

	# Bootloader information: U-Boot environment when the tool exists ...
	[ -x /usr/bin/fw_printenv ] && exec_cmd fw_printenv fw_printenv
	# ... and the grub environment plus config on grub systems.
	# grub.cfg is a regular file, so it must be tested with -f; the
	# previous -d test could never succeed and skipped this data.
	if [ -f /boot/grub/grub.cfg ]; then
		exec_cmd grub-editenv grub-editenv list
		cp -a --parents /boot/grub/grub.cfg "$SUP_TOPDIR"
	fi

	[ -x /usr/lib/cumulus/onie/onie-version ] &&
		exec_cmd onie-version /usr/lib/cumulus/onie/onie-version

	# Kernel build configuration for the running kernel
	cp -a --parents /boot/config-"$(uname -r)" "$SUP_TOPDIR"

	exec_cmd dmidecode dmidecode
	# bios_version and cpld_version are shell functions from this file
	exec_cmd bios bios_version
	exec_cmd cpld cpld_version
}

system_hwinfo()
{
	# Collect platform hardware state: EEPROMs, sensors, fans/LED daemons,
	# PCI, PoE (if present), flash/filesystem and disk SMART information.
	# Globals:  jexec, json (read) - empty unless -j was given
	# Uses exec_cmd (defined outside this file) to run and record commands.
	#
	# Every scratch variable is local, and the optional smartctl device
	# type starts out empty, so nothing inherited from the caller or the
	# environment can end up on the smartctl command line.
	local full fans fan rootdev scsidev rest
	local -a scsi=() scsi_paths=()
	local -a smartopts=(-H -i -A -f brief -c -l error -l selftest)

	exec_cmd platform.detect platform-detect
	# $jexec and $json are intentionally unquoted: when empty they must
	# disappear from the argument list rather than become empty arguments.
	exec_cmd $jexec decode-syseeprom /usr/cumulus/bin/decode-syseeprom -t all $json
	exec_cmd $jexec decode-syseeprom.psu1 /usr/cumulus/bin/decode-syseeprom -t psu1 $json
	exec_cmd $jexec decode-syseeprom.psu2 /usr/cumulus/bin/decode-syseeprom -t psu2 $json
	# Derive the fan target names from the tool's help text: break it into
	# one token per line and keep the tokens that mention "fan".
	fans=$(decode-syseeprom -h |& sed -n -e 's/[ ,()]/\
/gp' | grep fan)
	for fan in $fans; do
		exec_cmd $jexec "decode-syseeprom.$fan" /usr/cumulus/bin/decode-syseeprom -t "$fan" $json
	done
	exec_cmd sensors sensors
	exec_cmd $jexec smonctl smonctl -v $json
	exec_cmd pwmd pwmd -d
	exec_cmd $jexec ledmgrd ledmgrd -d $json
	exec_cmd lspci lspci -vv -xx # -vvv or -xxx can trigger chip errata
	full=$(type -p poectl)
	if [ -n "$full" ]; then
		exec_cmd $jexec poe_diag poectl $json --diag-info --verbose
		exec_cmd $jexec poe_sys poectl $json -s --verbose
		exec_cmd $jexec poe_sys_i poectl $json -i
	fi

	# Device backing the root filesystem (first field of the df line that
	# starts with /dev); stays empty when root is not on a /dev device.
	read -r rootdev rest <<< "$(df / | grep '^/dev')"
	# First by-path entry matching *scsi*0; if nothing matches, this is
	# the literal pattern and the -e test below fails.
	scsi_paths=(/dev/disk/by-path/*scsi*0)
	scsidev=${scsi_paths[0]}

	type ubinfo >& /dev/null && exec_cmd ubifs.info ubinfo -a
	type mtdinfo >& /dev/null && exec_cmd mtd.info mtdinfo -a

	if [ -n "$rootdev" ]; then
		type e2freefrag >& /dev/null && exec_cmd ext4frag.info e2freefrag "$rootdev"
		type tune2fs >& /dev/null && exec_cmd ext4fs.info tune2fs -l "$rootdev"
		[ -e "$scsidev" ] && scsi=(-d scsi)
		exec_cmd smart smartctl "${smartopts[@]}" "${scsi[@]}" "$rootdev"
	fi

	exec_cmd ssd_health.show /usr/lib/cumulus/dump_ssd_health
}

run_df()
{
    # Run df in the background, appending its output (and timestamped
    # start/end markers) to the file "df" in the current directory.
    # If df is still running after 5 seconds it is killed and a
    # "Timed out" marker is written instead of "Completed".
    # NOTE(review): presumably guards against df hanging on an
    # unresponsive filesystem - confirm.
    local cpid

    date +"# %F_%T.%N: df" &>> df
    df &>> df &
    cpid=$!
    # Drop the job from the shell's job table (all jobs, as before).
    disown -a
    sleep 5
    # A real if/else so exactly one end marker is written; with the
    # former "(a && b) || c" form a failing kill also logged "Completed".
    if kill -0 "$cpid" 2>/dev/null; then
        date +"# %F_%T.%N: df Timed out" &>> df
        kill "$cpid" 2>/dev/null
    else
        date +"# %F_%T.%N: Completed df" &>> df
    fi
}

system_misc()
{
	# Copy assorted /proc, /sys, /run and /var state into the archive
	# staging tree and record a number of general system commands.
	# Globals:  SUP_TOPDIR (read) - root of the archive staging tree
	#           SUP_VERBOSE (read) - 1 enables progress messages on stderr
	#           module (read)      - module name used in messages
	# Uses exec_cmd (defined outside this file) and run_df.
	local d

	[ "${SUP_VERBOSE:-0}" -eq 1 ] &&
		date +"%F_%T.%N: ${module}.${FUNCNAME[0]}: Add /proc files to archive" 1>&2
	cp -a --parents /proc/buddyinfo /proc/cmdline /proc/consoles /proc/cpuinfo \
		/proc/devices /proc/diskstats /proc/interrupts /proc/iomem \
		/proc/ioports /proc/kallsyms /proc/loadavg /proc/locks \
		/proc/meminfo /proc/misc /proc/modules /proc/self/mounts /proc/mtd \
		/proc/self/net /proc/pagetypeinfo /proc/partitions /proc/softirqs \
		/proc/stat /proc/swaps /proc/sysvipc /proc/timer_list /proc/uptime \
		/proc/version /proc/vmallocinfo /proc/vmstat /proc/zoneinfo \
		"$SUP_TOPDIR" &

	# sysfs data - with many interfaces the file list can exceed the
	# argument-list limit.  List only readable, non-empty files (so cp
	# errors need not be hidden) and let xargs batch them into cp calls
	# that fit.  This replaces the former split(1) chunk files, which
	# used predictable names under /tmp and broke when nothing matched.
	{
		[ "${SUP_VERBOSE:-0}" -eq 1 ] &&
			date +"%F_%T.%N: ${module}.${FUNCNAME[0]}: Add /sys files to archive" 1>&2
		rgrep -a -l . /sys/class/net/* 2>/dev/null |
			xargs -r -d '\n' cp --parents --preserve=all -t "$SUP_TOPDIR"
	} &
	wait # /proc and /sys in parallel

	{
		cp -a --parents /var/lib/cumulus "$SUP_TOPDIR" # platform status, etc
		# capture history of command failures and heartbeat timeouts, if any
		cp -a --parents /run/failure.* /run/watchdog.* /run/problems "$SUP_TOPDIR" 2>/dev/null
		# and sysmonitor status
		[ -d /run/sysmonitor ] && cp -a --parents /run/sysmonitor "$SUP_TOPDIR"
		[ -e /run/utmp ] && cp -a --parents /run/utmp "$SUP_TOPDIR"
		[ -e /run/nclu ] && cp -a --parents /run/nclu "$SUP_TOPDIR"
		[ -e /run/tacacs_client_map ] && cp -a --parents /run/tacacs_client_map \
			"$SUP_TOPDIR"
		cp -a --parents /var/spool/cron "$SUP_TOPDIR"
		# only copy the port cache when at least one entry is listed
		[ "$(ls -1 /var/cache/cumulus/port* 2> /dev/null | wc -l)" -gt 0 ] &&
			cp -a --parents /var/cache/cumulus/port* "$SUP_TOPDIR"
		[ -f /proc/linux-kernel-bde ] && cp -a --parents /proc/*-bde "$SUP_TOPDIR"
		[ -f /proc/dma ] && cp -a --parents /proc/dma "$SUP_TOPDIR"
		for d in /proc/bcm /proc/mlx_sx; do
			[ -d "$d" ] && cp -a --parents "$d" "$SUP_TOPDIR"
		done
		# sockets copied from /run are removed from the staging tree
		find "$SUP_TOPDIR/run" -type s -exec rm -f '{}' \; # nclu, others
	} &
	wait # all other hierarchy copies in parallel

	exec_cmd cpu-vulnerabilities rgrep . /sys/devices/system/cpu/vulnerabilities
	exec_cmd sysctl sysctl -a
	exec_cmd who who -aH
	exec_cmd lastlog lastlog
	exec_cmd uptime uptime
	exec_cmd mount mount
	run_df
	exec_cmd parted-list parted --list
	exec_cmd blkid blkid
	exec_cmd lsblk lsblk
}


system_uefi()
{
	# Collect UEFI boot configuration and the EFI sysfs tree.
	# Globals:  SUP_TOPDIR (read) - root of the archive staging tree
	# Returns 0 immediately when /sys/firmware/efi does not exist.
	[ -d /sys/firmware/efi ] || return 0

	# UEFI boot order.  Use the "command -v" builtin rather than the
	# external "which" to check for the tool.
	if command -v efibootmgr > /dev/null 2>&1 ; then
		exec_cmd "efibootmgr" efibootmgr -v
	fi

	# EFI sysfs data, including EFI variables; copy errors are discarded
	cp -a --parents /sys/firmware/efi "$SUP_TOPDIR" 2> /dev/null
}

system_pkgverify()
{
	# Non-default sub-module: record dpkg's check of installed files
	# against the package contents.
	local -a verify_cmd=(dpkg --verify)

	exec_cmd dpkg.verify "${verify_cmd[@]}"
}

system_pkg()
{
	# Record the list of installed packages.
	local -a list_cmd=(dpkg -l)

	exec_cmd dpkg.installed "${list_cmd[@]}"
}

system_dmesg()
{
	# Record the kernel's in-memory log ring buffer, which may hold
	# messages that have not reached the on-disk logs yet.
	# The function's status is that of exec_cmd.
	exec_cmd dmesg dmesg
}

system_systemd()
{
	# Record systemd state useful for debugging: the journal, the service
	# summary, several systemctl views and local unit overrides.
	local -a sc_common=(-l --no-pager)
	local -a sc_tag=(failed status units unitfiles)
	local -a sc_verb=(--failed status list-units list-unit-files)
	local idx

	# The journal dump runs in the background while the rest is collected;
	# only the last 8 MiB of what it writes to stdout is kept.
	exec_cmd systemd.journal journalctl -l -o short-precise --no-pager |
		tail -c $((8 * 1024 * 1024)) &

	exec_cmd cl-service-summary cl-service-summary
	for idx in "${!sc_tag[@]}"; do
		exec_cmd "systemd.${sc_tag[idx]}" systemctl "${sc_common[@]}" "${sc_verb[idx]}"
	done
	exec_cmd systemd.delta systemd-delta

	# let the background journal dump finish before returning
	wait
}

system_memory_use()
{
	# Record process, memory, slab and vmstat information.
	local thread_cols=pid,tid,uid,size,rss,pri,flag,cls,stat,psr,start,time,wchan:20,comm,args

	exec_cmd ps.aux ps aux
	exec_cmd ps.threads ps -Teo "$thread_cols"
	exec_cmd free free -l -m

	# slab data: stdin is tied to /dev/null for these two commands
	exec_cmd slabtop slabtop -o < /dev/null
	if [ -f /proc/slabinfo ]; then
		exec_cmd slabinfo cat /proc/slabinfo < /dev/null
	fi

	# five one-second samples, then the table views
	exec_cmd vmstat vmstat 1 5
	exec_cmd vmstat.m vmstat -m
	exec_cmd vmstat.s vmstat -s
}


# Make a top-level symlink for /var/log, so the logs are added to the
# archive by cl-support; warn when /var/log is larger than 50MB.
# Globals:  SUP_TOPDIR (read)  - root of the archive staging tree
#           SUP_VERBOSE (read) - 1 enables a progress message on stderr
#           module (read)      - module name used in messages
system_logs()
{
    local loguse

    mkdir -p "${SUP_TOPDIR}/var"
    [ "${SUP_VERBOSE:-0}" -eq 1 ] &&
        date +"%F_%T.%N: ${module}.${FUNCNAME[0]}: Add /var/log to archive" 1>&2

    # Size of /var/log in MB (first field of du's output).  The stderr
    # redirect must be on du itself: placed after the assignment it does
    # not apply to the command substitution, so du errors leaked through.
    loguse=$(du -ms /var/log 2>/dev/null | sed 's/[ \t].*//')
    case "$loguse" in
    ''|*[!0-9]*) ;; # no usable number (du failed or odd output): skip check
    *)
        if [ "$loguse" -gt 50 ]; then
            echo "cl-support.${module}: /var/log is large: ${loguse}MB" 1>&2
            logger -t "cl-support.${module}" -p warn "/var/log is large: ${loguse}MB"
        fi
        ;;
    esac

    # -n: if the link already exists, fail instead of following it and
    # creating a "log" link inside the real /var/log.
    ln -sn /var/log "${SUP_TOPDIR}/var/log"
}

# Copy (rather than link) the /etc directory, so that the copied files
# can be edited to remove sensitive info.  The etckeeper .bzr and/or .git
# directories are dropped from the copy to avoid sensitive info and to
# save space; the copied os-release is removed as well.
system_configs()
{
    local etc_copy=${SUP_TOPDIR}/etc

    if [ $SUP_VERBOSE -eq 1 ]; then
        date +"%F_%T.%N: ${module}.${FUNCNAME[0]}: Add /etc (configs) to archive" 1>&2
    fi

    cp -a /etc $etc_copy
    rm -rf $etc_copy/.git $etc_copy/.bzr $etc_copy/os-release
}

s6000_get_cpld_versions()
{
    # Print the CPLD versions as name=0x<digit> lines, read over i2c
    # with i2cget (selected for platform dell,s6000_s1220).
    # Arguments: $1 - optional directory listing the <bus>-<addr> i2c
    #                 devices; defaults to /sys/bus/i2c/drivers/dummy
    # Outputs:   system_cpld=, master_cpld= and slave_cpld= lines for the
    #            devices at addresses 0031, 0032 and 0033 respectively
    # All scratch variables are local so nothing leaks into the caller.
    local devdir=${1:-/sys/bus/i2c/drivers/dummy}
    local cpld bus addr reg label ver

    for cpld in "$devdir"/*3*; do
        cpld=${cpld##*/}
        # entry names look like <bus>-<addr>; take the first two fields
        bus=${cpld%%-*}
        addr=${cpld#*-}
        addr=${addr%%-*}
        # each CPLD keeps its version in a different register
        case $addr in
            0031) reg=0   label=system_cpld ;;
            0032) reg=1   label=master_cpld ;;
            0033) reg=0xa label=slave_cpld ;;
            *) continue ;;
        esac
        # keep only the 4th character of i2cget's output (e.g. the "5"
        # of "0x05")
        ver=0x$(i2cget -f -y "$bus" "0x$addr" "$reg" | cut -c 4)
        echo "$label=$ver"
    done
}


s4000_get_cpld_versions()
{
    # Print the CPLD versions as name=0x<digit> lines, read over i2c
    # with i2cget (selected for platform dell,s4000_c2338).
    # Arguments: $1 - optional directory listing the <bus>-<addr> i2c
    #                 devices; defaults to /sys/bus/i2c/drivers/dummy
    # Outputs:   system_cpld=, master_cpld= and slave_cpld= lines for the
    #            devices at addresses 0031, 0032 and 0033 respectively
    # All scratch variables are local so nothing leaks into the caller.
    local devdir=${1:-/sys/bus/i2c/drivers/dummy}
    local cpld bus addr reg label ver

    for cpld in "$devdir"/*3*; do
        cpld=${cpld##*/}
        # entry names look like <bus>-<addr>; take the first two fields
        bus=${cpld%%-*}
        addr=${cpld#*-}
        addr=${addr%%-*}
        # register holding the version on each CPLD
        case $addr in
            0031) reg=0 label=system_cpld ;;
            0032) reg=1 label=master_cpld ;;
            0033) reg=0 label=slave_cpld ;;
            *) continue ;;
        esac
        # keep only the 4th character of i2cget's output (e.g. the "5"
        # of "0x05")
        ver=0x$(i2cget -f -y "$bus" "0x$addr" "$reg" | cut -c 4)
        echo "$label=$ver"
    done
}

# Print CPLD versions found under /sys/devices as name=value lines.
# Not all platforms expose such files; in that case nothing is printed.
# The file lists are kept in local arrays (no globals leak out, and
# paths are not word-split), and grep -E replaces the obsolescent egrep.
get_cpld_versions()
{
    local -a files=() dirs=()
    local pattern='cpld.*(version|revision)'

    # Files whose path mentions cpld followed by version or revision
    mapfile -t files < <(find /sys/devices -type f | grep -Ei "$pattern")
    if [ ${#files[@]} -eq 0 ]; then
        # Fallback: search explicitly inside directories named *cpld*
        mapfile -t dirs < <(find /sys/devices -type d -iname '*cpld*')
        if [ ${#dirs[@]} -gt 0 ]; then
            mapfile -t files < <(find "${dirs[@]}" -type f | grep -Ei "$pattern")
        fi
    fi
    if [ ${#files[@]} -gt 0 ]; then
        # grep -H prints path:content; reduce the path to its basename,
        # drop commas and turn the "path: " separator into "=".
        grep -H . "${files[@]}" | sed -e 's,.*/,,' -e 's/,//g' -e 's/: */=/g'
    # else no /sys cpld versions on this platform.
    fi
}

bios_version()
{
    # Print the Vendor, Version, Date and Revision lines from the section
    # of dmidecode's output that starts at a line beginning "BIOS In..."
    # and ends at the next empty line.
    local field
    local -a script=(-e '/^BIOS.In/,/^$/{')

    # one print command per field of interest, in the original order
    for field in Vendor Version Date Revision; do
        script+=(-e "/$field/p")
    done
    script+=(-e '}')

    dmidecode 2>/dev/null | sed -n "${script[@]}"
}

cpld_version()
{
    # Print CPLD versions using the reader that matches the platform
    # reported by platform-detect; the generic /sys/devices reader is
    # used for every platform without a dedicated one.
    local platform reader=get_cpld_versions

    platform=$(platform-detect 2>/dev/null)
    if [ "$platform" = "dell,s4000_c2338" ]; then
        reader=s4000_get_cpld_versions
    elif [ "$platform" = "dell,s6000_s1220" ]; then
        reader=s6000_get_cpld_versions
    fi

    $reader
}

system_time()
{
    # Record hardware clock, timedatectl and ntpq status.
    local cmd platform
    local -a rtc_opt=() ntpq_wrap=()

    # These platforms have the hardware clock read from /dev/rtc1.
    platform=$(platform-detect 2>/dev/null)
    case $platform in
    quanta,ly6_rangeley | quanta,ly8_rangeley | quanta,ly9_rangeley)
        rtc_opt=(-f /dev/rtc1)
        ;;
    esac
    exec_cmd time.hwclock hwclock -r "${rtc_opt[@]}"
    exec_cmd time.timedatectl timedatectl status

    # When ntpd shows up in the mgmt VRF's task list, run the ntpq
    # queries inside that VRF.
    if vrf task list mgmt 2>/dev/null | grep -sq ntpd; then
        ntpq_wrap=(vrf task exec mgmt)
    fi
    for cmd in peers associations kerninfo sysinfo sysstats; do
        exec_cmd time.$cmd "${ntpq_wrap[@]}" ntpq -nc $cmd
    done
}


# main: command-line option handling
#   -c <arg>  invoked for core dumps; only reported when verbose
#   -j        select JSON mode (sets jexec and json)
#   -l        list default and non-default sub-modules, then exit
# Any other option is ignored here.

while getopts "c:jl" Option; do
    case $Option in
    c)  # default submods
        if [ $SUP_VERBOSE -eq 1 ]; then
            echo ${module}: Invoked for "$OPTARG" core dumps 1>&2
        fi
        ;;
    j)
        jexec=-j
        json=--json
        ;;
    l)
        echo "${funcs[@]}" ';' "${nondef_funcs[@]}"
        exit 0
        ;;
    *)
        ;;
    esac
done
# drop the parsed options; what remains are sub-module names
shift $((OPTIND - 1))

main()
{
    # Run each selected sub-module (system_<name>) in turn, printing a
    # start banner and the elapsed time for each on stdout.
    # Arguments: optional sub-module names; when given they replace the
    #            global funcs list
    # Globals:   funcs (read/written), module, SUP_VERBOSE (read),
    #            stderr (read) - number of the fd that carries the
    #            sub-modules' stderr
    local -r TIMEFORMAT='%2R seconds' tfile=/run/${module}_funcstime$$
    local secs func
    if [ $# -ne 0 ]; then # only run specified sub-modules
        [ "${SUP_VERBOSE:-0}" -eq 1 ] &&
            echo "${module}: run only submodules: $*" 1>&$stderr
        funcs=("$@")
    fi

    ulimit -s 65536 # make overflows less likely, and xargs splits larger

    for func in "${funcs[@]}"; do
        [ "${SUP_VERBOSE:-0}" -eq 1 ] && echo "$module.$func" 1>&2
        date +"### $module.$func Started at %F-%T.%N"
        # The sub-module's own stderr goes to fd $stderr; the only thing
        # left on the group's stderr is the output of "time", which is
        # captured in $tfile and read back below.
        { time "system_$func" 2>&$stderr ; } 2>"$tfile"
        secs= # never report a previous sub-module's time if the read fails
        read -r secs < "$tfile"
        echo "### $module.$func Completed in $secs"
    done
    rm -f -- "$tfile"
}

# Run main and report the module's total run time.
# fd 42 is a copy of the original stderr: main's diagnostics are sent
# there, while the group's stderr - which then carries only the output
# of "time" - is merged into stdout.
TIMEFORMAT="Module $module Completed in %2R seconds"
exec 42>&2
stderr=42

# "$@" is quoted so the sub-module names reach main exactly as given.
{ time main "$@" 2>&$stderr ; } 2>&1

# always exit 0, whatever the sub-modules returned
exit 0
