#!/bin/ksh
#--------------------------------------------------------------------------------#
# This file is part of PALM.
#
# PALM is free software: you can redistribute it and/or modify it under the terms
# of the GNU General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# PALM. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright 1997-2014 Leibniz Universitaet Hannover
#--------------------------------------------------------------------------------#
#
# Current revisions:
# -----------------
#
#
# Former revisions:
# -----------------
# $Id: hlrn_watchdog 1449 2014-08-07 10:47:26Z boeske $
#
# 1448 2014-08-07 10:47:05Z maronga
# Bugfix: removed debug mode
#
# 1446 2014-08-07 10:08:56Z maronga
# Adapted for HLRN III. Added windows if no runs are queued.
#
# 1046 2012-11-09 14:38:45Z maronga
# code put under GPL (PALM 3.9)
#
# 1029 2012-10-17 15:33:56Z maronga
# Initial revision
#
# Description:
# ------------
# The hlrn_watchdog works for jobs on HLRN in Hannover/Berlin and with KDE.
# It can be used for monitoring currently submitted jobs and will display
# Running, Idle and Blocked jobs in a separate window, which is updated every
# 10 minutes.
# Starting: "hlrn_watchdog start <username>"
# Stopping: "hlrn_watchdog stop"
#------------------------------------------------------------------------------!
check_hannover=true check_berlin=true debug=false cd $PALM_BIN gate_h="hlogin.hlrn.de" gate_b="blogin.hlrn.de" update_frequency=600 # trap strg+c trap 'kill -9 $infoPID > /dev/null; kill -9 $dialogPID > /dev/null; exit' 2 # start/stop routine if [[ $1 == "stop" ]] then result=`ps aux|grep -c "./hlrn_watchdog"` if (( $result > 2 )) then result=`ps aux|grep -m1 "./hlrn_watchdog"` killid=`echo $result | tr -s " " | cut -d" " -s -f2` kill -9 $killid > /dev/null killall kdialog if [[ -f .watchdog_report.x ]] then rm .watchdog_report.x fi if [[ -f .watchdog_status.x ]] then rm .watchdog_status.x fi echo "*** hlrn_watchdog stopped." else echo "+++ hlrn_watchdog is not running." fi exit elif [[ $1 == "start" ]] then result=`ps aux|grep -c "hlrn_watchdog"` if (( $result > 2 )) then echo "+++ hlrn_watchdog is already running." else if [[ $debug = true ]] then ./hlrn_watchdog $2 & else nohup ./hlrn_watchdog $2 1> /dev/null 2> /dev/null & fi echo "\n*** hlrn_watchdog starting..." fi exit else # login via ssh and collect information in .watchdog_report.x while true do touch .watchdog_report.x if [[ $check_hannover == true ]] then ssh $gate_h -l $1 "showq | egrep \"($1)\"" > .watchdog_report.x fi if [[ $check_berlin == true ]] then ssh $gate_b -l $1 "showq | egrep \"($1)\"" >> .watchdog_report.x fi i=0 j=0 cat .watchdog_report.x|while read variable do # analyze output comid[$i]=`echo $variable | tr -s " " | cut -d" " -s -f1` jobid[$i]=`echo ${comid[$i]} | tr -s " " | cut -d"." -s -f2` complex[$i]=`echo ${comid[$i]} | tr -s " " | cut -d"." 
-s -f1` username[$i]=`echo $variable | tr -s " " | cut -d" " -s -f2` status[$i]=`echo $variable | tr -s " " | cut -d" " -s -f3` nodes[$i]=`echo $variable | tr -s " " | cut -d" " -s -f4` walltime[$i]=`echo $variable | tr -s " " | cut -d" " -s -f5` day[$i]=`echo $variable | tr -s " " | cut -d" " -s -f6` month[$i]=`echo $variable | tr -s " " | cut -d" " -s -f7` date[$i]=`echo $variable | tr -s " " | cut -d" " -s -f8` subtime[$i]=`echo $variable | tr -s " " | cut -d" " -s -f9` ((i = i + 1)) done rm .watchdog_report.x touch .watchdog_status.x # check for terminated jobs and status changes k=0 cat .watchdog_status.x|while read variable do # analyze output old_comid[$k]=`echo $variable | tr -s " " | cut -d" " -s -f1` old_status[$k]=`echo $variable | tr -s " " | cut -d" " -s -f2` ((k = k + 1)) done rm .watchdog_status.x info="" for ((m=0;m<$k;m++)) do found=0 for ((n=0;n<$i;n++)) do if [[ ${old_comid[$m]} == ${comid[$n]} ]] then if [[ ${old_status[$m]} != ${status[$n]} ]] then info="$info ${old_comid[$m]} has changed status from ${old_status[$m]} to ${status[$n]}.\n" fi found=1 break fi done if (( $found == 0 )) then info="$info ${old_comid[$m]} has been terminated (Status was ${old_status[$m]}).\n" fi done # check whether any jobs are queued touch .watchdog_report.x file_size=`ls -l .watchdog_report.x | tr -s " " | cut -d " " -f 5` if [[ $file_size == 0 ]] then printf "No jobs queued.\n" >> .watchdog_report.x fi # get estimated starting time for all idle jobs and write watchdog output in .watchdog_report.x while (( $j < $i )) do if [[ ${status[$j]} == "Idle" ]] then if [[ ${complex[$j]} == "hannover" ]] then eststart[$j]=`ssh $gate_h -l $1 "showstart ${comid[$j]}|grep \"based start in\""` else eststart[$j]=`ssh $gate_b -l $1 "showstart ${comid[$j]}|grep \"based start in\""` fi eststart[$j]="Start in: "`echo ${eststart[$j]} | tr -s " " | cut -d" " -s -f6` else eststart[$j]="" fi # write final output line ((k = j + 1)) printf "%-9s%8s%3s%04i%13s%11s%2s%-s%3s%-20s\n" "Job: 
$k:" "${status[$j]}" " @ " "${nodes[$j]}" " nodes, time:" "${walltime[$j]}" " (" "${comid[$j]}" "). " "${eststart[$j]}" >> .watchdog_report.x printf "${comid[$j]} ${status[$j]}\n" >> .watchdog_status.x ((j = j + 1)) done # kill all windows if [[ "$infoPID" -ne "" ]] then kill -9 $infoPID > /dev/null unset $infoPID fi if [[ "$dialogPID" -ne "" ]] then kill -9 $dialogPID > /dev/null unset $dialogPID fi timestamp=`date` # create window and show information kdialog --textbox .watchdog_report.x 550 150 --title "HLRN watchdog (last update: $timestamp)" & dialogPID=$! # in case of status changes and terminated jobs, inform the user if [[ $info != "" ]] then kdialog --msgbox "$info" --title "HLRN Job Information" & infoPID=$! fi sleep $update_frequency done fi