source: palm/trunk/SCRIPTS/palm_wdd @ 2499

Last change on this file since 2499 was 2421, checked in by maronga, 7 years ago

bugfix in watchdog

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.7 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 2421 2017-09-07 10:36:34Z kanani $
27# Fixed display of job progress.
28#
29# 2416 2017-09-06 14:28:14Z maronga
30# Adapted for palmrun
31#
32# 1619 2015-07-13 06:53:19Z maronga
33#
34# 1618 2015-07-13 06:52:15Z maronga
35# Added steering via configuration file, to be placed in the home directory of the
36# remote host to be monitored.
37#
38# 1613 2015-07-08 14:53:29Z maronga
39# Bugfix: tooltip for queuing name did not show up on first update.
# New: added context menu for showing the parameter file and the run control
# output
42#
43# 1611 2015-07-07 12:23:22Z maronga
44# Initial revision
45#
46#
47# Description:
48# ------------
# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
# by the user. The watchdog server requires Python 2.7 or higher to be installed
# on the host to be monitored.
52#
53# Instructions:
54# -------------
55# 1) Modify the header section of palm_wd
56# 2) Move .wd.olddata and .wd.newdata to your palm directory
57#    (e.g. /home/user/current_version/.wd.newdata etc.)
58# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
59#    respective hosts
60# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
61#
62# To do:
63# ------
64# 1) Add "Options", "Help" and "Manual"
65# 2) Move user settings to a configuration file
66#------------------------------------------------------------------------------!
67
68import ConfigParser
69import os
70import pwd
71from subprocess import check_output
72import sys
73
74
# Read configuration file
# The file is looked up relative to the current working directory; without it
# none of the queuing commands are known, so stop with a message.
if not os.path.exists('.wdd.config'):
    print("Error. No configuration file .wdd.config found.")
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# All values are read from the [Settings] section; enclosing double quotes
# around a value are stripped.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')


# Two command line arguments are required: <action> <data>.
# Report a missing argument with a readable message instead of letting the
# script die with an IndexError traceback.
if len(sys.argv) < 3:
    print("Error. Usage: palm_wdd <action> <data>")
    raise SystemExit

action   = str(sys.argv[1])
data     = str(sys.argv[2])

# Login name of the user running this script (looked up once, used twice)
current_user = pwd.getpwuid( os.getuid() )[ 0 ]

# The queue command gets the user name as an additional argument; the user
# name is appended directly (no separator) to the temporary directory path.
cmd_readqueue = cmd_readqueue + " " + current_user
cmd_tmpdir    = cmd_tmpdir + current_user
98
# reading queuing system
def ReadQueue(username):
   """Print one status line for every job listed by the queuing system.

   The configured queue command (cmd_readqueue) is executed and each line of
   its output is split at blanks. For every job the following line is
   printed, with the items separated by single blanks:

      real name, job id (column 0), column 3, state (column 2),
      progress, column 4, estimated start time

   username -- not used inside this function; cmd_readqueue already
               contains the name of the current user.

   Nothing is returned.
   """

   # collect queuing information; any failure of the queue command is
   # treated as an empty job list
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split at blanks and drop the empty strings caused by runs of blanks
      fields = [field for field in line.split(" ") if field]

      # columns 0, 2, 3 and 4 are accessed below; skip lines that are too
      # short (e.g. blank lines) instead of aborting the whole listing
      if len(fields) < 5:
         continue

      job_id    = fields[0]
      job_state = fields[2]

      # retrieve real job name: second blank-separated item of the line that
      # grep selects from the check-job output
      cmd_realname = cmd_checkjob + " " + job_id + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress from the PROGRESS file in the
      # job's temporary directory; all other jobs report "0"
      job_progress = "0"
      if job_state == "Running":
         cmd_progress = "cat " + cmd_tmpdir + "/" + job_realname + "/PROGRESS"
         try:
            # error output of cat is discarded; the with-statement makes sure
            # the devnull handle is closed again for every job
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            # fourth blank-separated item on the second line of the file
            job_progress = out.splitlines()[1].split(" ")[3]
         except Exception:
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(job_id)
      print(job_realname + " " + job_id + " " + fields[3] + " " + job_state + " " + job_progress + " " + fields[4] + " " + job_starttime)
151
152
# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured check-job command for a job.

   jobid -- job identifier, appended as argument to cmd_checkjob

   Returns the command output, or "No details available." if running the
   command fails.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   try:
      job_details = check_output(cmd_checkjob_tmp, shell=True)
   except Exception:
      # best effort: every failure is reported as a plain message, but
      # KeyboardInterrupt/SystemExit are no longer swallowed
      job_details = "No details available."

   return job_details
165
166
# cancel a specific job
def CancelJob(jobid):
   """Run the configured cancel command for a job and return its output.

   jobid -- job identifier, appended as argument to cmd_canceljob

   Returns the command output, or "Action failed." if running the command
   fails.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   try:
      job_canceled = check_output(cmd_canceljob_tmp, shell=True)
   except Exception:
      # best effort: every failure is reported as a plain message, but
      # KeyboardInterrupt/SystemExit are no longer swallowed
      job_canceled = "Action failed."

   return job_canceled
179
180
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start time reported by the configured start-time command.

   The output of cmd_starttime for the given job is filtered with grep for
   cmd_starttime_grep; the sixth whitespace-separated item of the result is
   returned.

   jobid -- job identifier, appended as argument to cmd_starttime

   Returns "Action failed." if the command fails or its output has fewer
   than six items.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      out = check_output(cmd_starttime_tmp, shell=True)
      job_starttime = out.split()[5]
   except Exception:
      # covers both a failing command and too few items in the output
      job_starttime = "Action failed."

   return job_starttime
193
194
def DoStopNow(jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   jobid -- name of the job's subdirectory below cmd_tmpdir

   Returns None on success and "Action failed." if the touch command fails.
   """

   # create the stop flag file
   cmd_dostop = "touch " + cmd_tmpdir + "/" + jobid + "/DO_STOP_NOW"
   try:
      # error output of touch is discarded; the with-statement closes the
      # devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): the main section prints this return value, so a successful
   # call shows up as "None" in the output. Kept unchanged because the remote
   # client may rely on it -- confirm before replacing it with a message.
   return None
206
def DoRestartNow(jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   jobid -- name of the job's subdirectory below cmd_tmpdir

   Returns None on success and "Action failed." if the touch command fails.
   """

   # create the restart flag file
   cmd_dorestart = "touch " + cmd_tmpdir + "/" + jobid + "/DO_RESTART_NOW"
   try:
      # error output of touch is discarded; the with-statement closes the
      # devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): the main section prints this return value, so a successful
   # call shows up as "None" in the output. Kept unchanged because the remote
   # client may rely on it -- confirm before replacing it with a message.
   return None
218
def GetPARIN(jobid):
   """Return the contents of the file PARIN in a job's temporary directory.

   jobid -- name of the job's subdirectory below cmd_tmpdir

   Returns the file contents as delivered by cat. On failure the string
   "Action failed." directly followed by the attempted command is returned.
   """

   # read the parameter file
   cmd_parin = "cat " + cmd_tmpdir + "/" + jobid + "/PARIN"
   try:
      # error output of cat is discarded; the with-statement closes the
      # devnull handle again
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      # the attempted command is appended without separator, which keeps the
      # message identical to the previous implementation
      return_message = "Action failed." + cmd_parin

   return return_message
232
def GetRC(jobid):
   """Return the contents of the file RUN_CONTROL in a job's temporary directory.

   jobid -- name of the job's subdirectory below cmd_tmpdir

   Returns the file contents as delivered by cat, or "Action failed." if the
   command fails.
   """

   # read the run control output
   cmd_runcontrol = "cat " + cmd_tmpdir + "/" + jobid + "/RUN_CONTROL"
   try:
      # error output of cat is discarded; the with-statement closes the
      # devnull handle again
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_runcontrol, shell=True, stderr=devnull)
   except Exception:
      return_message = "Action failed."

   return return_message
246
# START OF MAIN
# Dispatch the requested action. "queue" is handled separately because
# ReadQueue prints its result itself; every other action returns a value
# that is printed here.
if action == "queue":
   ReadQueue(data)
else:
   handlers = {
      "check":   CheckJob,
      "cancel":  CancelJob,
      "start":   GetStartTime,
      "stop":    DoStopNow,
      "restart": DoRestartNow,
      "parin":   GetPARIN,
      "rc":      GetRC,
   }
   handler = handlers.get(action)
   if handler is None:
      print("Error. Action " + action + " unknown.")
   else:
      print(handler(data))
266
Note: See TracBrowser for help on using the repository browser.