source: palm/trunk/SCRIPTS/palm_wdd @ 2061

Last change on this file since 2061 was 2061, checked in by maronga, 5 years ago

bugfix in PALM watchdog

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.9 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22# Bugfix: return proper error messages when .wdd.config is not found and for
23# wrong actions
24#
25# Former revisions:
26# -----------------
27# $Id: palm_wdd 2061 2016-11-10 14:53:31Z maronga $
28#
29# 1618 2015-07-13 06:52:15Z maronga
30# Added steering via configuration file, to be placed in the home directory of the
31# remote host to be monitored.
32#
33# 1613 2015-07-08 14:53:29Z maronga
34# Bugfix: tooltip for queuing name did not show up on first update.
35# New: added context menu for showing the parameter file and the run control
36# output
37#
38# 1611 2015-07-07 12:23:22Z maronga
39# Initial revision
40#
41#
42# Description:
43# ------------
44# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
45# by the user. The watchdog server requires Python 2.7 or higher to be installed
46# on the host to be monitored.
47#
48# Instructions:
49# -------------
50# 1) Modify the header section of palm_wd
51# 2) Move .wd.olddata and .wd.newdata to your palm directory
52#    (e.g. /home/user/current_version/.wd.newdata etc.)
53# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
54#    respective hosts
55# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
56#
57# To do:
58# ------
59# 1) Add "Options", "Help" and "Manual"
60# 2) Move user settings to a configuration file
61#------------------------------------------------------------------------------!
62
63import ConfigParser
64import os
65from subprocess import check_output
66import sys
67import socket
68
# Read configuration file
# First check if the configuration file exists ('.wdd.config' is looked up
# relative to the current working directory).
if not os.path.exists('.wdd.config'):
    sys.stderr.write("No configuration file .wdd.config found on remote host %s." % socket.gethostname())
    # NOTE(review): a bare SystemExit ends the script with exit status 0. This is
    # kept as-is because the client's handling of a non-zero status is not
    # visible from this file.
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# Shell command templates from the [Settings] section; surrounding double
# quotes in the configuration values are removed.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')


# Command line: <action> <data> [<data2>]
# Report missing arguments with a message instead of an IndexError traceback.
if len(sys.argv) < 3:
    sys.stderr.write("Missing arguments on remote host %s. Usage: palm_wdd <action> <data> [<data2>]" % socket.gethostname())
    raise SystemExit

action = str(sys.argv[1])
data   = str(sys.argv[2])

# data2 is optional; give it a defined value so that later references do not
# raise a NameError when the third argument is absent.
if len(sys.argv) > 3:
    data2 = str(sys.argv[3])
else:
    data2 = ""


# data is appended to both commands: as an argument of the queue command and
# as a direct suffix (no separator is inserted) of the temporary directory.
cmd_readqueue = cmd_readqueue + " " + data
cmd_tmpdir    = cmd_tmpdir + data
95
# reading queuing system
def ReadQueue(username):
   """Print one status line for every job reported by the queue command.

   Runs the module-level ``cmd_readqueue`` command and, for each line of its
   output, prints

      <real name> <job id> <field 3> <field 2> <progress> <field 4> <start time>

   where the fields are the blank-separated columns of that line.

   username -- used to build the path of a running job's PROGRESS file below
               ``cmd_tmpdir``.

   Nothing is returned. A failing queue command is treated as an empty queue.
   """

   # collect queuing information; do nothing for an empty/failed result
   try:
      out = check_output(cmd_readqueue, shell=True, universal_newlines=True)
      job_list = out.splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # Split on blanks and drop the empty strings caused by repeated blanks.
      fields = [field for field in line.split(" ") if field]

      # Columns 0, 2, 3 and 4 are used below. Skip blank or short lines
      # instead of aborting the whole listing with an IndexError.
      if len(fields) < 5:
         continue

      jobid = fields[0]
      job_state = fields[2]

      # retrieve real job name for all jobs
      cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True, universal_newlines=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if job_state == "Running":

         # The progress value is the second blank-separated token on the
         # second line of the job's PROGRESS file.
         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
         try:
            # "with" closes the devnull handle again (it used to leak)
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull,
                                  universal_newlines=True)
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(jobid)
      print(job_realname + " " + jobid + " " + fields[3] + " " + job_state + " " + job_progress + " " + fields[4] + " " + job_starttime)
148
149
# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured check-job command for *jobid*.

   The command is ``cmd_checkjob`` followed by a blank and *jobid*, run
   through the shell. If it cannot be run or exits with a non-zero status,
   the string "No details available." is returned instead.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   try:
      # universal_newlines=True yields text (not bytes) on Python 3 as well
      job_details = check_output(cmd_checkjob_tmp, shell=True,
                                 universal_newlines=True)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      job_details = "No details available."

   return job_details
162
163
# cancel a specific job
def CancelJob(jobid):
   """Run the configured cancel command for *jobid* and return its output.

   The command is ``cmd_canceljob`` followed by a blank and *jobid*, run
   through the shell. If it cannot be run or exits with a non-zero status,
   the string "Action failed." is returned instead.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   try:
      # universal_newlines=True yields text (not bytes) on Python 3 as well
      job_canceled = check_output(cmd_canceljob_tmp, shell=True,
                                  universal_newlines=True)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      job_canceled = "Action failed."

   return job_canceled
176
177
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start-time token reported for *jobid*.

   Runs ``cmd_starttime`` with *jobid* appended, filters the output through
   grep with the pattern ``cmd_starttime_grep`` and returns the sixth
   whitespace-separated token of the result. If the command fails, grep
   finds nothing, or fewer than six tokens are present, the string
   "Action failed." is returned.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      # universal_newlines=True yields text (not bytes) on Python 3 as well
      out = check_output(cmd_starttime_tmp, shell=True,
                         universal_newlines=True)
      job_starttime = out.split()[5]
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      job_starttime = "Action failed."

   return job_starttime
190
191
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The directory is ``cmd_tmpdir`` + "/" + *username* + "." + the part of
   *jobid* after its first dot. The file is created with "touch".

   Returns None on success and the string "Action failed." otherwise.
   """

   # create the flag file in the job's temporary directory
   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # "with" closes the devnull handle again (it used to leak)
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      return "Action failed."

   # success has always been signalled by returning None; kept for callers
   return None
203
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The directory is ``cmd_tmpdir`` + "/" + *username* + "." + the part of
   *jobid* after its first dot. The file is created with "touch".

   Returns None on success and the string "Action failed." otherwise.
   """

   # create the flag file in the job's temporary directory
   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # "with" closes the devnull handle again (it used to leak)
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      return "Action failed."

   # success has always been signalled by returning None; kept for callers
   return None
215
def GetPARIN(username,jobid):
   """Return the contents of the file PARIN in a job's temporary directory.

   The directory is ``cmd_tmpdir`` + "/" + *username* + "." + the part of
   *jobid* after its first dot. The file is read with "cat".

   Returns the file contents, or the string "Action failed." if the file
   cannot be read.
   """

   # read the parameter file from the job's temporary directory
   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # "with" closes the devnull handle again (it used to leak);
      # universal_newlines=True yields text (not bytes) on Python 3 as well
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_parin, shell=True, stderr=devnull,
                                       universal_newlines=True)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      return_message = "Action failed."

   return return_message
229
def GetRC(username,jobid):
   """Return the contents of the file RUN_CONTROL in a job's temporary directory.

   The directory is ``cmd_tmpdir`` + "/" + *username* + "." + the part of
   *jobid* after its first dot. The file is read with "cat".

   Returns the file contents, or the string "Action failed." if the file
   cannot be read.
   """

   # read the run control output from the job's temporary directory
   cmd_rc = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # "with" closes the devnull handle again (it used to leak);
      # universal_newlines=True yields text (not bytes) on Python 3 as well
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_rc, shell=True, stderr=devnull,
                                       universal_newlines=True)
   except Exception:
      # best effort: report any failure as text, but no longer swallow
      # KeyboardInterrupt/SystemExit as the former bare "except:" did
      return_message = "Action failed."

   return return_message
243
# START OF MAIN
# Dispatch on the action given as first command line argument. "data" is the
# second argument and "data2" the optional third one. Results are printed to
# stdout; ReadQueue prints its lines itself.
# print(...) with a single argument behaves the same on Python 2 and 3.

# These actions pass data2 to their handler, so the third command line
# argument must be present; otherwise data2 may be undefined.
if ( action in ("stop", "restart", "parin", "rc") and len(sys.argv) <= 3 ):
   sys.stderr.write("Missing argument for action %s on remote host %s." % (action, socket.gethostname()))
elif ( action == "queue" ):
   ReadQueue(data)
elif ( action == "check"):
   print(CheckJob(data))
elif ( action == "cancel"):
   print(CancelJob(data))
elif ( action == "start"):
   print(GetStartTime(data))
elif ( action == "stop"):
   print(DoStopNow(data,data2))
elif ( action == "restart"):
   print(DoRestartNow(data,data2))
elif ( action == "parin"):
   print(GetPARIN(data,data2))
elif ( action == "rc"):
   print(GetRC(data,data2))
else:
   # "hostname" was never defined, so this branch used to raise a NameError
   # instead of reporting the unknown action.
   sys.stderr.write("Unknown action on remote host %s." % socket.gethostname())
263
Note: See TracBrowser for help on using the repository browser.