source: palm/trunk/SCRIPTS/palm_wdd @ 1613

Last change on this file since 1613 was 1613, checked in by maronga, 6 years ago

bugfix in install_rrtmg, removed nc2vdf from mbuild, improved palm_wd

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.0 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22# Bugfix: tooltip for queuing name did not show up on first update.
23# New: added context menu for showing the parameter file and the run control
24# output
25#
26# Former revisions:
27# -----------------
28# $Id: palm_wdd 1613 2015-07-08 14:53:29Z maronga $
29#
30# 1611 2015-07-07 12:23:22Z maronga
31# Initial revision
32#
33#
34# Description:
35# ------------
36# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
37# by the user. The watchdog server requires Python 2.7 or higher to be installed
38# on the host to be monitored.
39#
40# Instructions:
41# -------------
42# 1) Modify the header section of palm_wd
43# 2) Move .wd.olddata and .wd.newdata to your palm directory
44#    (e.g. /home/user/current_version/.wd.newdata etc.)
45# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
46#    respective hosts
47# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
48#
49# To do:
50# ------
51# 1) Add "Options", "Help" and "Manual"
52# 2) Move user settings to a configuration file
53#------------------------------------------------------------------------------!
54
55import os
56import sys
57from subprocess import check_output
58
59# START OF HEADER
60
61# configuration for host
# Shell command listing the queue; the filter pattern for egrep is appended
# further below from the command line.
cmd_readqueue = "showq | egrep "

# Base directory of the temporary job directories (extended further below).
cmd_tmpdir = "/gfs1/tmp/"

# Commands that take a job id as their argument.
cmd_canceljob = "canceljob"
cmd_checkjob = "checkjob"
cmd_starttime = "showstart"

# grep patterns selecting the relevant line from the output of cmd_checkjob
# (real job name) and cmd_starttime (estimated start time), respectively.
cmd_realname_grep = "AName"
cmd_starttime_grep = "start in"
69
70# END OF HEADER
71
72
# Command line: <action> <data> [<data2>]
# Exit with a usage message (written to stderr, exit status 1) instead of an
# IndexError traceback when the mandatory arguments are missing.
if ( len(sys.argv) < 3 ):
   sys.exit("Usage: " + sys.argv[0] + " <action> <data> [<data2>]")

action = str(sys.argv[1])
data   = str(sys.argv[2])

# The third argument is optional on the command line. Default to an empty
# string so that the two-argument actions report a failure instead of
# aborting with a NameError when it is omitted.
data2  = str(sys.argv[3]) if len(sys.argv) > 3 else ""

# The first data argument is appended to the queue query (as the egrep
# pattern) and to the base directory of the temporary job directories.
cmd_readqueue = cmd_readqueue + data
cmd_tmpdir    = cmd_tmpdir + data
81
82
83
84# reading queuing system
def ReadQueue(username):
   """Print one status line for every job returned by the queue query.

   The shell command stored in cmd_readqueue is executed and each line of
   its output is split at blanks. For every job the following items are
   printed, separated by single blanks:

      real job name, column 1 (job id), column 4, column 3 (job state),
      progress, column 5, estimated start time

   The real job name is taken from the cmd_checkjob output ("error" if it
   cannot be determined). The progress is read from the PROGRESS file in the
   temporary directory of the job and is "0" for jobs that are not in state
   "Running" or if the file cannot be read.

   username -- prefix of the temporary job directory names
   """

   # Collect queuing information. A failing command (e.g. egrep exits with a
   # non-zero status if nothing matches) is treated as an empty queue.
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # Split at blanks and drop the empty strings caused by repeated blanks.
      fields = [field for field in line.split(" ") if field]

      # Lines with fewer than five columns cannot be job entries; skip them
      # instead of aborting the whole listing with an IndexError.
      if len(fields) < 5:
         continue

      jobid = fields[0]
      state = fields[2]

      # Retrieve the real job name.
      cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # For running jobs, determine the progress.
      job_progress = "0"
      if state == "Running":
         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
         try:
            # Discard error output of cat; the context manager closes the
            # handle again so that no descriptor is leaked per job.
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            # The progress is the second blank-separated item on line two.
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # Output the job data.
      job_starttime = GetStartTime(jobid)
      print(" ".join([job_realname, jobid, fields[3], state, job_progress, fields[4], job_starttime]))
136
137
138# check details of specific job
def CheckJob(jobid):
   """Return the output of "cmd_checkjob <jobid>".

   jobid -- id of the job, passed to the command unchanged

   Returns the text "No details available." if the command cannot be
   executed or exits with a non-zero status.
   """

   try:
      return check_output(cmd_checkjob + " " + jobid, shell=True)
   except Exception:
      # Catch errors only; KeyboardInterrupt and SystemExit must propagate.
      return "No details available."
150
151
152# cancel a specific job
def CancelJob(jobid):
   """Run "cmd_canceljob <jobid>" and return its output.

   jobid -- id of the job, passed to the command unchanged

   Returns the text "Action failed." if the command cannot be executed or
   exits with a non-zero status.
   """

   try:
      return check_output(cmd_canceljob + " " + jobid, shell=True)
   except Exception:
      # Catch errors only; KeyboardInterrupt and SystemExit must propagate.
      return "Action failed."
164
165
166# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the estimated start time of a job.

   Runs "cmd_starttime <jobid>", keeps the lines matching cmd_starttime_grep
   and returns the sixth whitespace-separated item of that output.

   jobid -- id of the job, passed to the command unchanged

   Returns the text "Action failed." if the command fails, nothing matches,
   or the output has fewer than six items.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      return check_output(cmd_starttime_tmp, shell=True).split()[5]
   except Exception:
      # Catch errors only; KeyboardInterrupt and SystemExit must propagate.
      return "Action failed."
178
179
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<suffix>, where <suffix> is the
   part of jobid behind its first ".".

   username -- prefix of the temporary job directory name
   jobid    -- id of the job

   Returns None on success and the text "Action failed." if the file could
   not be created.
   """

   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # Discard error output of touch; the context manager closes the handle
      # again, which the previous implementation never did.
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
191
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<suffix>, where <suffix> is the
   part of jobid behind its first ".".

   username -- prefix of the temporary job directory name
   jobid    -- id of the job

   Returns None on success and the text "Action failed." if the file could
   not be created.
   """

   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # Discard error output of touch; the context manager closes the handle
      # again, which the previous implementation never did.
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
203
def GetPARIN(username,jobid):
   """Return the content of the file PARIN in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<suffix>, where <suffix> is the
   part of jobid behind its first ".".

   username -- prefix of the temporary job directory name
   jobid    -- id of the job

   Returns the text "Action failed." if the file cannot be read.
   """

   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # Discard error output of cat; the context manager closes the handle
      # again, which the previous implementation never did.
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
217
def GetRC(username,jobid):
   """Return the content of the file RUN_CONTROL in the temporary directory of a job.

   The directory is <cmd_tmpdir>/<username>.<suffix>, where <suffix> is the
   part of jobid behind its first ".".

   username -- prefix of the temporary job directory name
   jobid    -- id of the job

   Returns the text "Action failed." if the file cannot be read.
   """

   cmd_rc = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # Discard error output of cat; the context manager closes the handle
      # again, which the previous implementation never did.
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_rc, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
231
232# START OF MAIN
# Actions whose handler receives only the first data argument and whose
# return value is printed.
_single_argument_actions = {
   "check":  CheckJob,
   "cancel": CancelJob,
   "start":  GetStartTime,
}

# Actions whose handler receives both data arguments and whose return value
# is printed.
_double_argument_actions = {
   "stop":    DoStopNow,
   "restart": DoRestartNow,
   "parin":   GetPARIN,
   "rc":      GetRC,
}

if action == "queue":
   # ReadQueue prints its results itself.
   ReadQueue(data)
elif action in _single_argument_actions:
   print(_single_argument_actions[action](data))
elif action in _double_argument_actions:
   print(_double_argument_actions[action](data, data2))
else:
   print("Error. Action " + action + " unknown.")
251
Note: See TracBrowser for help on using the repository browser.