source: palm/trunk/SCRIPTS/palm_wdd @ 1618

Last change on this file since 1618 was 1618, checked in by maronga, 9 years ago

watchdog is now steered via configuration file

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.7 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22# Added steering via configuration file, to be placed in the home directory of the
23# remote host to be monitored.
24#
25# Former revisions:
26# -----------------
27# $Id: palm_wdd 1618 2015-07-13 06:52:15Z maronga $
28#
29# 1613 2015-07-08 14:53:29Z maronga
30# Bugfix: tooltip for queuing name did not show up on first update.
31# New: added context menu for showing the parameter file and the run control
32# output
33#
34# 1611 2015-07-07 12:23:22Z maronga
35# Initial revision
36#
37#
38# Description:
39# ------------
40# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
41# by the user. The watchdog server requires Python 2.7 or higher to be installed
42# on the host to be monitored.
43#
44# Instructions:
45# -------------
46# 1) Modify the header section of palm_wd
47# 2) Move .wd.olddata and .wd.newdata to your palm directory
48#    (e.g. /home/user/current_version/.wd.newdata etc.)
49# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
50#    respective hosts
51# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
52#
53# To do:
54# ------
55# 1) Add "Options", "Help" and "Manual"
56# 2) Move user settings to a configuration file
57#------------------------------------------------------------------------------!
58
59import ConfigParser
60import os
61from subprocess import check_output
62import sys
63
64
# Read configuration file
# First check if the configuration file exists. The file name is relative, so
# it is looked up in the current working directory of the process.
if not os.path.exists('.wdd.config'):
   print("Error. No configuration file .wdd.config found.")
   raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# All commands are stored in section [Settings]; surrounding double quotes of
# the values are removed.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')


# Command line: palm_wdd <action> <data> [<data2>]
# <data> is the user name or the job id (depending on the action), <data2> is
# the job id for those actions that need both a user name and a job id.
if len(sys.argv) < 3:
   # Report missing arguments in the same way as a missing configuration file
   # instead of failing with an IndexError traceback.
   print("Error. Usage: palm_wdd <action> <data> [<data2>]")
   raise SystemExit

action = str(sys.argv[1])
data   = str(sys.argv[2])

# data2 is optional; give it a defined (empty) value so that it can always be
# referenced later on.
data2  = str(sys.argv[3]) if len(sys.argv) > 3 else ""


# The first data argument is appended to the queue command and to the
# temporary directory path.
cmd_readqueue = cmd_readqueue + " " + data
cmd_tmpdir    = cmd_tmpdir + data
91
# reading queuing system
def ReadQueue(username):
   """Print one status line for every job listed by the queuing system.

   The configured queue command (cmd_readqueue) is executed and each line of
   its output is split at blanks. For every job the following line is printed:

      <real name> <job id> <field 3> <state> <progress> <field 4> <start time>

   where job id, state, field 3 and field 4 are the blank-separated fields
   0, 2, 3 and 4 of the queue line. The progress is read from the PROGRESS file
   of the job for jobs in state "Running" and is "0" otherwise.

   Arguments:
      username -- user name, used to build the path of the PROGRESS file

   Returns nothing; the result is written to standard output.
   """

   # collect queuing information; a failing queue command is treated like an
   # empty queue
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split at blanks and drop the empty strings caused by runs of blanks
      fields = [field for field in line.split(" ") if field]

      # fields 0 and 2-4 are needed below; lines that are too short (e.g.
      # blank lines) cannot be job entries and are skipped instead of
      # aborting the whole listing with an IndexError
      if len(fields) < 5:
         continue

      jobid = fields[0]

      # retrieve real job name for all jobs
      cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if fields[2] == "Running":

         # collect progress information
         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
         try:
            # error output of cat is discarded; the with statement closes the
            # null device again after each job
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            # the value is the second blank-separated item of the second line
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(jobid)
      print(job_realname + " " + jobid + " " + fields[3] + " " + fields[2] + " " + job_progress + " " + fields[4] + " " + job_starttime)
144
145
# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured checkjob command for a job.

   Arguments:
      jobid -- job id, appended to the checkjob command (cmd_checkjob)

   Returns the command output, or "No details available." if the command
   could not be executed successfully.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   # Best effort: any failure of the command yields the fallback text.
   # "except Exception" is used so that SystemExit and KeyboardInterrupt are
   # not swallowed.
   try:
      job_details = check_output(cmd_checkjob_tmp, shell=True)
   except Exception:
      job_details = "No details available."

   return job_details
158
159
# cancel a specific job
def CancelJob(jobid):
   """Run the configured canceljob command for a job.

   Arguments:
      jobid -- job id, appended to the canceljob command (cmd_canceljob)

   Returns the command output, or "Action failed." if the command could not
   be executed successfully.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   # Best effort: any failure of the command yields the fallback text.
   # "except Exception" is used so that SystemExit and KeyboardInterrupt are
   # not swallowed.
   try:
      job_canceled = check_output(cmd_canceljob_tmp, shell=True)
   except Exception:
      job_canceled = "Action failed."

   return job_canceled
172
173
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start time reported by the configured starttime command.

   The output of the starttime command (cmd_starttime) for the given job is
   filtered with grep for cmd_starttime_grep; the sixth whitespace-separated
   item of the remaining output is returned.

   Arguments:
      jobid -- job id, appended to the starttime command

   Returns the extracted item, or "Action failed." if the command failed,
   grep found no match, or the output has fewer than six items.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   # Best effort: "except Exception" keeps the fallback for command and
   # parsing errors without swallowing SystemExit and KeyboardInterrupt.
   try:
      out = check_output(cmd_starttime_tmp, shell=True)
      job_starttime = out.split()[5]
   except Exception:
      job_starttime = "Action failed."

   return job_starttime
186
187
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<part of jobid after the first
   dot>.

   Arguments:
      username -- user name, part of the directory name
      jobid    -- job id; only the part after the first "." is used

   Returns None if the file was created and "Action failed." otherwise.
   """

   # create the flag file via touch
   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # error output of touch is discarded; the with statement closes the
      # null device again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
199
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<part of jobid after the first
   dot>.

   Arguments:
      username -- user name, part of the directory name
      jobid    -- job id; only the part after the first "." is used

   Returns None if the file was created and "Action failed." otherwise.
   """

   # create the flag file via touch
   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # error output of touch is discarded; the with statement closes the
      # null device again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
211
def GetPARIN(username,jobid):
   """Return the content of the file PARIN in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<part of jobid after the first
   dot>.

   Arguments:
      username -- user name, part of the directory name
      jobid    -- job id; only the part after the first "." is used

   Returns the file content, or "Action failed." if the file could not be
   read.
   """

   # read the file via cat
   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # error output of cat is discarded; the with statement closes the null
      # device again
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      return_message = "Action failed."

   return return_message
225
def GetRC(username,jobid):
   """Return the content of the file RUN_CONTROL in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<part of jobid after the first
   dot>.

   Arguments:
      username -- user name, part of the directory name
      jobid    -- job id; only the part after the first "." is used

   Returns the file content, or "Action failed." if the file could not be
   read.
   """

   # read the file via cat
   cmd_rc = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # error output of cat is discarded; the with statement closes the null
      # device again
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_rc, shell=True, stderr=devnull)
   except Exception:
      return_message = "Action failed."

   return return_message
239
# START OF MAIN
# Dispatch on the requested action. ReadQueue prints its result itself; all
# other functions return a value which is printed here.
# The actions stop, restart, parin and rc need the user name (data) and the
# job id (data2). data2 is taken from the third command line argument, so
# these actions are rejected if that argument is missing.
if ( action in ("stop", "restart", "parin", "rc") and len(sys.argv) <= 3 ):
   print("Error. Action " + action + " requires a job id as second argument.")
elif ( action == "queue" ):
   ReadQueue(data)
elif ( action == "check" ):
   print(CheckJob(data))
elif ( action == "cancel" ):
   print(CancelJob(data))
elif ( action == "start" ):
   print(GetStartTime(data))
elif ( action == "stop" ):
   print(DoStopNow(data,data2))
elif ( action == "restart" ):
   print(DoRestartNow(data,data2))
elif ( action == "parin" ):
   print(GetPARIN(data,data2))
elif ( action == "rc" ):
   print(GetRC(data,data2))
else:
   print("Error. Action " + action + " unknown.")
259
Note: See TracBrowser for help on using the repository browser.