source: palm/trunk/SCRIPTS/palm_wdd @ 1614

Last change on this file since 1614 was 1614, checked in by maronga, 9 years ago

last commit documented

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.1 KB
RevLine 
[1611]1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
[1614]23#
[1611]24# Former revisions:
25# -----------------
26# $Id: palm_wdd 1614 2015-07-08 14:54:35Z maronga $
27#
[1614]28# 1613 2015-07-08 14:53:29Z maronga
29# Bugfix: tooltip for queuing name did not show up on first update.
30# New: added context menu for showing the parameter file and the run control
31# output
32#
[1612]33# 1611 2015-07-07 12:23:22Z maronga
34# Initial revision
35#
[1611]36#
37# Description:
38# ------------
39# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
40# by the user. The watchdog server requires python 2.7 or higher installed on
41# the host to be monitored.
42#
43# Instructions:
44# -------------
45# 1) Modify the header section of palm_wd
46# 2) Move .wd.olddata and .wd.newdata to your palm directory
47#    (e.g. /home/user/current_version/.wd.newdata etc.)
48# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
49#    respective hosts
50# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
51#
52# To do:
53# ------
54# 1) Add "Options", "Help" and "Manual"
55# 2) Move user settings to a configuration file
56#------------------------------------------------------------------------------!
57
58import os
59import sys
60from subprocess import check_output
61
# START OF HEADER

# configuration for host
cmd_readqueue      = "showq | egrep "
cmd_tmpdir         = "/gfs1/tmp/"
cmd_canceljob      = "canceljob"
cmd_checkjob       = "checkjob"
cmd_realname_grep  = "AName"
cmd_starttime      = "showstart"
cmd_starttime_grep = "start in"

# END OF HEADER


# Command line: <action> <data> [<data2>]
#   action : name of the action dispatched in the main section
#   data   : first argument of the action (passed as username or jobid)
#   data2  : second argument of the action (passed as jobid), if it needs one
if ( len(sys.argv) > 2 ):
   action = sys.argv[1]
   data   = sys.argv[2]
else:
   # Too few arguments: report on stderr and fall through with an empty
   # action, so that the main section prints its "unknown action" error
   # instead of the script dying with an IndexError traceback.
   sys.stderr.write("Usage: " + sys.argv[0] + " <action> <username|jobid> [<jobid>]\n")
   action = ""
   data   = ""

# data2 is always bound (empty if not given), so that actions which use it
# fail with their regular error message rather than with a NameError.
if ( len(sys.argv) > 3 ):
   data2 = sys.argv[3]
else:
   data2 = ""


# the first argument is appended to both the queue query and the tmp path
cmd_readqueue = cmd_readqueue + data
cmd_tmpdir    = cmd_tmpdir + data
84
85
86
# retrieve real job name of a job
def _GetRealName(jobid):
   """Return the real job name of *jobid*, or "error" if it cannot be read.

   Runs the configured check-job command, greps its output for
   cmd_realname_grep and takes the second space-separated token.
   """
   cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
   try:
      # universal_newlines=True yields text on both Python 2 and Python 3
      out = check_output(cmd_realname, shell=True, universal_newlines=True)
      return out.split(" ")[1].rstrip()
   except Exception:
      # command failed, no matching line, or unexpected output format
      return "error"


# determine progress of a running job
def _GetProgress(username, jobid):
   """Return the progress token from the job's PROGRESS file, or "0".

   The file is read from <cmd_tmpdir>/<username>.<part of jobid after the
   first ".">/PROGRESS; the value is the second space-separated token of
   the file's second line.
   """
   cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
   try:
      # discard cat's error output; the handle is closed again right away
      with open(os.devnull, 'w') as devnull:
         out = check_output(cmd_progress, shell=True, stderr=devnull, universal_newlines=True)
      return out.splitlines()[1].split(" ")[1]
   except Exception:
      # missing file or unexpected content
      return "0"


# reading queuing system
def ReadQueue(username):
   """Print one line of job information per entry returned by the queue query.

   The query itself is the module-level cmd_readqueue; *username* is only
   used to build the path of the PROGRESS file of running jobs.

   Each printed line consists of the space-separated values
      <real name> <jobid> <column 4> <state> <progress> <column 5> <start time>
   where jobid, state and the two further columns are the 1st, 3rd, 4th and
   5th whitespace-separated columns of the queue output.
   """

   # collect queuing information; a failing query (e.g. egrep finding no
   # match) is treated as an empty job list
   try:
      out = check_output(cmd_readqueue, shell=True, universal_newlines=True)
      job_list = out.splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split the line into its non-empty columns
      fields = [field for field in line.split(" ") if field]

      # skip lines that do not carry all columns used below instead of
      # aborting the whole listing with an IndexError
      if len(fields) < 5:
         continue

      jobid = fields[0]
      state = fields[2]

      job_realname = _GetRealName(jobid)

      # only running jobs have progress information
      if state == "Running":
         job_progress = _GetProgress(username, jobid)
      else:
         job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(jobid)
      print(" ".join([job_realname, jobid, fields[3], state, job_progress, fields[4], job_starttime]))
139
140
# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured check-job command for *jobid*.

   Runs "<cmd_checkjob> <jobid>" through the shell. If the command cannot
   be run or exits with a non-zero status, the string
   "No details available." is returned instead.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   try:
      # universal_newlines=True yields text on both Python 2 and Python 3
      return check_output(cmd_checkjob_tmp, shell=True, universal_newlines=True)
   except Exception:
      # best effort: any failure is reported as a plain message, but
      # KeyboardInterrupt / SystemExit are no longer swallowed
      return "No details available."
153
154
# cancel a specific job
def CancelJob(jobid):
   """Run the configured cancel command for *jobid* and return its output.

   Runs "<cmd_canceljob> <jobid>" through the shell. If the command cannot
   be run or exits with a non-zero status, the string "Action failed." is
   returned instead.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   try:
      # universal_newlines=True yields text on both Python 2 and Python 3
      return check_output(cmd_canceljob_tmp, shell=True, universal_newlines=True)
   except Exception:
      # best effort: any failure is reported as a plain message, but
      # KeyboardInterrupt / SystemExit are no longer swallowed
      return "Action failed."
167
168
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start time reported by the configured start-time command.

   Runs "<cmd_starttime> <jobid>", keeps the line(s) matching
   cmd_starttime_grep and returns the 6th whitespace-separated token of
   that output. If the command fails, nothing matches, or the output has
   fewer than six tokens, the string "Action failed." is returned.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      # universal_newlines=True yields text on both Python 2 and Python 3
      out = check_output(cmd_starttime_tmp, shell=True, universal_newlines=True)
      return out.split()[5]
   except Exception:
      # best effort: any failure is reported as a plain message, but
      # KeyboardInterrupt / SystemExit are no longer swallowed
      return "Action failed."
181
182
# request a stop of a specific job
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The file is touched at
      <cmd_tmpdir>/<username>.<part of jobid after the first ".">/DO_STOP_NOW
   (presumably evaluated by the running job - confirm against the model code).

   Returns None on success and the string "Action failed." if the file
   could not be created. The None on success is kept deliberately, because
   the caller prints the return value.
   """

   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # discard touch's error output; the handle is closed again right away
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
194
# request a restart of a specific job
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The file is touched at
      <cmd_tmpdir>/<username>.<part of jobid after the first ".">/DO_RESTART_NOW
   (presumably evaluated by the running job - confirm against the model code).

   Returns None on success and the string "Action failed." if the file
   could not be created. The None on success is kept deliberately, because
   the caller prints the return value.
   """

   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # discard touch's error output; the handle is closed again right away
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
206
# show the parameter file of a specific job
def GetPARIN(username,jobid):
   """Return the content of the file PARIN in the temporary directory of a job.

   The file is read from
      <cmd_tmpdir>/<username>.<part of jobid after the first ".">/PARIN
   If it cannot be read, the string "Action failed." is returned.
   """

   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # discard cat's error output; the handle is closed again right away.
      # universal_newlines=True yields text on both Python 2 and Python 3
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_parin, shell=True, stderr=devnull, universal_newlines=True)
   except Exception:
      return_message = "Action failed."

   return return_message
220
# show the run control output of a specific job
def GetRC(username,jobid):
   """Return the content of the file RUN_CONTROL in the temporary directory of a job.

   The file is read from
      <cmd_tmpdir>/<username>.<part of jobid after the first ".">/RUN_CONTROL
   If it cannot be read, the string "Action failed." is returned.
   """

   cmd_rc = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # discard cat's error output; the handle is closed again right away.
      # universal_newlines=True yields text on both Python 2 and Python 3
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_rc, shell=True, stderr=devnull, universal_newlines=True)
   except Exception:
      return_message = "Action failed."

   return return_message
234
# START OF MAIN
# Dispatch on the requested action. "queue" prints its own output; every
# other known action returns a value that is printed here.
if action == "queue":
   ReadQueue(data)
else:
   # The handlers are wrapped in lambdas so that only the selected action is
   # evaluated (and data2 is only looked up by the actions that use it).
   _action_handlers = {
      "check":   lambda: CheckJob(data),
      "cancel":  lambda: CancelJob(data),
      "start":   lambda: GetStartTime(data),
      "stop":    lambda: DoStopNow(data,data2),
      "restart": lambda: DoRestartNow(data,data2),
      "parin":   lambda: GetPARIN(data,data2),
      "rc":      lambda: GetRC(data,data2),
   }
   _handler = _action_handlers.get(action)
   if _handler is None:
      print("Error. Action " + action + " unknown.")
   else:
      print(_handler())
254
Note: See TracBrowser for help on using the repository browser.