#!/bin/ksh
#--------------------------------------------------------------------------------#
# This file is part of PALM.
#
# PALM is free software: you can redistribute it and/or modify it under the terms
# of the GNU General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# PALM. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright 1997-2014 Leibniz Universitaet Hannover
#--------------------------------------------------------------------------------#
#
# Current revisions:
# -----------------
#
#
# Former revisions:
# -----------------
# $Id: hlrn_watchdog 1449 2014-08-07 10:47:26Z witha $
#
# 1448 2014-08-07 10:47:05Z maronga
# Bugfix: removed debug mode
#
# 1446 2014-08-07 10:08:56Z maronga
# Adapted for HLRN III. Added windows if no runs are queued.
#
# 1046 2012-11-09 14:38:45Z maronga
# code put under GPL (PALM 3.9)
#
# 1029 2012-10-17 15:33:56Z maronga
# Initial revision
#
# Description:
# ------------
# The hlrn_watchdog works for jobs on HLRN in Hannover/Berlin and with KDE.
# It can be used for monitoring currently submitted jobs and will display
# Running, Idle and Blocked jobs in a separate window, which is updated every
# 10 minutes.
# Starting: "hlrn_watchdog start <username>"  (<username> is the HLRN login name)
# Stopping: "hlrn_watchdog stop"
#------------------------------------------------------------------------------!
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Which login gateways are queried with "showq" in the monitor loop.
check_hannover=true
check_berlin=true
# debug=true makes "start" launch the monitor without nohup and without
# discarding its output.
debug=false
# The script re-executes itself as ./hlrn_watchdog and keeps its state files
# (.watchdog_report.x, .watchdog_status.x) in the current directory, so it
# must run from $PALM_BIN. Abort instead of silently ending up in $HOME when
# PALM_BIN is unset or the directory is not accessible.
cd "${PALM_BIN:?PALM_BIN must point to the directory containing hlrn_watchdog}" || exit 1
gate_h="hlogin.hlrn.de"
gate_b="blogin.hlrn.de"
# Seconds to sleep between two queue checks.
update_frequency=600
# trap Ctrl+C (signal 2): close the windows opened by the monitor loop, then
# leave. The PID variables are only set once a window has been opened, so each
# one is checked before it is handed to kill; kill errors for windows the user
# has already closed are discarded.
trap '
  if [[ -n "$infoPID" ]]; then kill "$infoPID" 2> /dev/null; fi
  if [[ -n "$dialogPID" ]]; then kill "$dialogPID" 2> /dev/null; fi
  exit
' 2
# start/stop routine
#
# Dispatch on the first argument:
#   stop              kill a running watchdog and all kdialog windows
#   start <username>  relaunch this script in the background as
#                     "./hlrn_watchdog <username>"
#   <username>        monitor loop: query the queues via ssh as <username>
#                     and display the result every $update_frequency seconds
if [[ $1 == "stop" ]]; then
  # Number of ps lines that mention the script. The "> 2" threshold is the
  # original heuristic; presumably it discounts this "stop" invocation and
  # the grep process itself -- NOTE(review): confirm on the target system.
  running=$(ps aux | grep -c "./hlrn_watchdog")
  if (( running > 2 )); then
    # The PID is the second column of the first matching ps line.
    killid=$(ps aux | grep -m1 "./hlrn_watchdog" | awk '{print $2}')
    if [[ -n $killid ]]; then
      kill -9 "$killid" > /dev/null
    fi
    # Closes every kdialog process killall can reach, not only the windows
    # opened by the watchdog.
    killall kdialog
    rm -f .watchdog_report.x .watchdog_status.x
    echo "*** hlrn_watchdog stopped."
  else
    echo "+++ hlrn_watchdog is not running."
  fi
  exit
elif [[ $1 == "start" ]]; then
  # Without a user name the monitor would call "ssh -l" with a missing
  # argument on every cycle, so refuse to start.
  if [[ -z $2 ]]; then
    echo "+++ usage: hlrn_watchdog start <username>" >&2
    exit 1
  fi
  running=$(ps aux | grep -c "hlrn_watchdog")
  if (( running > 2 )); then
    echo "+++ hlrn_watchdog is already running."
  else
    if [[ $debug == true ]]; then
      ./hlrn_watchdog "$2" &
    else
      nohup ./hlrn_watchdog "$2" > /dev/null 2>&1 &
    fi
    printf '\n*** hlrn_watchdog starting...\n'
  fi
  exit
else
  if [[ -z $1 ]]; then
    echo "+++ usage: hlrn_watchdog start <username> | hlrn_watchdog stop" >&2
    exit 1
  fi
  user=$1

  while true; do
    # login via ssh and collect information in .watchdog_report.x
    # Truncate first: at this point the file still holds the formatted report
    # of the previous cycle, which must not be parsed as showq output when
    # only some of the gateways are enabled.
    : > .watchdog_report.x
    if [[ $check_hannover == true ]]; then
      ssh "$gate_h" -l "$user" "showq | egrep \"($user)\"" >> .watchdog_report.x
    fi
    if [[ $check_berlin == true ]]; then
      ssh "$gate_b" -l "$user" "showq | egrep \"($user)\"" >> .watchdog_report.x
    fi

    # analyze output: the whitespace-separated columns used below are
    # 1 = complete job id, 3 = status, 4 = nodes, 5 = wall time.
    # The loop reads via redirection (not a pipe) so that the arrays and the
    # counter are set in this shell regardless of the ksh flavour.
    i=0
    while read -r id owner state node_count wall rest; do
      [[ -z $id ]] && continue
      comid[$i]=$id
      # complex = part of the job id before the first dot, empty if the id
      # contains no dot
      case $id in
        *.*) complex[$i]=${id%%.*} ;;
        *)   complex[$i]="" ;;
      esac
      status[$i]=$state
      nodes[$i]=$node_count
      walltime[$i]=$wall
      ((i = i + 1))
    done < .watchdog_report.x

    # load the job ids and states recorded in the previous cycle
    k=0
    if [[ -f .watchdog_status.x ]]; then
      while read -r id state rest; do
        [[ -z $id ]] && continue
        old_comid[$k]=$id
        old_status[$k]=$state
        ((k = k + 1))
      done < .watchdog_status.x
    fi

    # check for terminated jobs and status changes
    # (the "\n" sequences are passed on literally to kdialog)
    info=""
    for ((m = 0; m < k; m++)); do
      found=0
      for ((n = 0; n < i; n++)); do
        if [[ ${old_comid[$m]} == "${comid[$n]}" ]]; then
          if [[ ${old_status[$m]} != "${status[$n]}" ]]; then
            info="$info ${old_comid[$m]} has changed status from ${old_status[$m]} to ${status[$n]}.\n"
          fi
          found=1
          break
        fi
      done
      if (( found == 0 )); then
        info="$info ${old_comid[$m]} has been terminated (Status was ${old_status[$m]}).\n"
      fi
    done

    # start the new report and the new status record from empty files
    : > .watchdog_report.x
    : > .watchdog_status.x

    # check whether any jobs are queued
    # (decided by the number of parsed jobs; the size of the freshly emptied
    # report file would always be zero)
    if (( i == 0 )); then
      printf "No jobs queued.\n" >> .watchdog_report.x
    fi

    # get estimated starting time for all idle jobs and write watchdog output
    # in .watchdog_report.x
    for ((j = 0; j < i; j++)); do
      if [[ ${status[$j]} == "Idle" ]]; then
        if [[ ${complex[$j]} == "hannover" ]]; then
          gate=$gate_h
        else
          gate=$gate_b
        fi
        answer=$(ssh "$gate" -l "$user" "showstart ${comid[$j]}|grep \"based start in\"")
        # sixth whitespace-separated word of the answer, counted across lines
        start_in=$(printf '%s\n' "$answer" |
          awk '{ for (f = 1; f <= NF; f++) if (++n == 6) { print $f; exit } }')
        eststart[$j]="Start in: $start_in"
      else
        eststart[$j]=""
      fi

      # write final output line (jobs are numbered from 1)
      ((job_no = j + 1))
      printf "%-9s%8s%3s%04i%13s%11s%2s%-s%3s%-20s\n" "Job: $job_no:" "${status[$j]}" " @ " "${nodes[$j]}" " nodes, time:" "${walltime[$j]}" " (" "${comid[$j]}" "). " "${eststart[$j]}" >> .watchdog_report.x
      # remember id and state for the comparison in the next cycle
      printf '%s %s\n' "${comid[$j]}" "${status[$j]}" >> .watchdog_status.x
    done

    # kill all windows of the previous cycle; the PID variables are cleared
    # afterwards so that a stale PID is never signalled again
    if [[ -n $infoPID ]]; then
      kill "$infoPID" 2> /dev/null
      unset infoPID
    fi
    if [[ -n $dialogPID ]]; then
      kill "$dialogPID" 2> /dev/null
      unset dialogPID
    fi

    timestamp=$(date)

    # create window and show information
    kdialog --textbox .watchdog_report.x 550 150 --title "HLRN watchdog (last update: $timestamp)" &
    dialogPID=$!

    # in case of status changes and terminated jobs, inform the user
    if [[ -n $info ]]; then
      kdialog --msgbox "$info" --title "HLRN Job Information" &
      infoPID=$!
    fi

    sleep "$update_frequency"
  done
fi