source: palm/trunk/SCRIPTS/palm_wdd @ 1786

Last change on this file since 1786 was 1619, checked in by maronga, 10 years ago

last commit documented

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.7 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 1619 2015-07-13 06:53:19Z raasch $
27#
28# 1618 2015-07-13 06:52:15Z maronga
29# Added steering via configuration file, to be placed in the home directory of the
30# remote host to be monitored.
31#
32# 1613 2015-07-08 14:53:29Z maronga
33# Bugfix: tooltip for queuing name did not show up on first update.
34# New: added context menu for showing the parameter file and the run control
35# output
36#
37# 1611 2015-07-07 12:23:22Z maronga
38# Initial revision
39#
40#
41# Description:
42# ------------
43# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
44# by the user. The watchdog server requires python 2.7 or higher installed on
45# the host to be monitored.
46#
47# Instructions:
48# -------------
49# 1) Modify the header section of palm_wd
50# 2) Move .wd.olddata and .wd.newdata to your palm directory
51#    (e.g. /home/user/current_version/.wd.newdata etc.)
52# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
53#    respective hosts
54# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
55#
56# To do:
57# ------
58# 1) Add "Options", "Help" and "Manual"
59# 2) Move user settings to a configuration file
60#------------------------------------------------------------------------------!
61
62import ConfigParser
63import os
64from subprocess import check_output
65import sys
66
67
# Read configuration file
# First check if the configuration file exists. The file name is relative, so
# it is looked up in the current working directory of the process.
if not os.path.exists('.wdd.config'):
   print("Error. No configuration file .wdd.config found.")
   raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# All settings are shell command fragments (or grep patterns) taken from the
# [Settings] section; enclosing double quotes are stripped from the values.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')


# Command line: <action> <data> [<data2>]
# Fail with a readable message (on stderr, non-zero exit status) instead of an
# IndexError traceback when the mandatory arguments are missing.
if len(sys.argv) < 3:
   raise SystemExit("Error. Usage: palm_wdd <action> <data> [<data2>]")

action = sys.argv[1]
data   = sys.argv[2]
# data2 is optional; it is always defined so that later code can test it.
data2  = sys.argv[3] if len(sys.argv) > 3 else None


# NOTE: data is appended unconditionally, whatever the action is. The results
# are only meaningful for those actions where data holds the user name.
cmd_readqueue = cmd_readqueue + " " + data
cmd_tmpdir    = cmd_tmpdir + data
94
95# reading queuing system
def ReadQueue(username):
   """Print one status line for every job reported by the queuing system.

   The command stored in cmd_readqueue is run through the shell and each line
   of its output is split at blanks into fields. Field 0 is used as job id and
   field 2 as job state; fields 3 and 4 are passed through unchanged. For
   every job a line of the form

      <real name> <job id> <field 3> <state> <progress> <field 4> <start time>

   is printed to standard output.

   username -- used to build the path of the PROGRESS file of running jobs
               (cmd_tmpdir/<username>.<part of the job id after the first
               dot>/PROGRESS)

   Nothing is returned. If the queue command fails, nothing is printed.
   Lines with fewer than five fields are skipped, since they cannot be job
   entries in the layout this function relies on.

   NOTE(review): all commands are assembled as strings and run with
   shell=True; the configuration values and the job ids must be trusted.
   """

   # collect queuing information; a failing command means "no jobs"
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for job_line in job_list:

      # Consecutive blanks produce empty strings when splitting; drop them.
      fields = [field for field in job_line.split(" ") if field]
      if len(fields) < 5:
         continue

      job_id    = fields[0]
      job_state = fields[2]

      # retrieve real job name for all jobs: second blank-separated token of
      # the checkjob output filtered by cmd_realname_grep
      cmd_realname = cmd_checkjob + " " + job_id + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if job_state == "Running":

         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + job_id.partition(".")[2] + "/PROGRESS"
         try:
            # Error messages of cat are discarded; the with-statement makes
            # sure the handle is closed again for every job.
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            # The progress value is the second token of the second line.
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(job_id)
      print(" ".join([job_realname, job_id, fields[3], job_state,
                      job_progress, fields[4], job_starttime]))
147
148
149# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured checkjob command for one job.

   jobid -- job id, appended (separated by a blank) to cmd_checkjob

   Returns the standard output of the command, or the string
   "No details available." if the command could not be run or exited with a
   non-zero status.

   NOTE(review): the command is run with shell=True and jobid is inserted
   unquoted; jobid must come from a trusted caller.
   """

   try:
      return check_output(cmd_checkjob + " " + jobid, shell=True)
   except Exception:
      # Only real errors are mapped to the fallback text; KeyboardInterrupt
      # and SystemExit are no longer swallowed.
      return "No details available."
161
162
163# cancel a specific job
def CancelJob(jobid):
   """Run the configured canceljob command for one job.

   jobid -- job id, appended (separated by a blank) to cmd_canceljob

   Returns the standard output of the command, or the string
   "Action failed." if the command could not be run or exited with a
   non-zero status.

   NOTE(review): the command is run with shell=True and jobid is inserted
   unquoted; jobid must come from a trusted caller.
   """

   try:
      return check_output(cmd_canceljob + " " + jobid, shell=True)
   except Exception:
      # Only real errors are mapped to the fallback text; KeyboardInterrupt
      # and SystemExit are no longer swallowed.
      return "Action failed."
175
176
177# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start time reported by the queuing system for one job.

   The output of "cmd_starttime <jobid>" is piped through grep with the
   pattern cmd_starttime_grep; the sixth whitespace-separated token of the
   result is returned.

   jobid -- job id, appended (separated by a blank) to cmd_starttime

   Returns the string "Action failed." if the command fails (this includes
   grep finding no match) or if the output has fewer than six tokens.

   NOTE(review): the command is run with shell=True and jobid is inserted
   unquoted; jobid must come from a trusted caller.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      out = check_output(cmd_starttime_tmp, shell=True)
      return out.split()[5]
   except Exception:
      # Covers command failures as well as an IndexError on short output;
      # KeyboardInterrupt and SystemExit are no longer swallowed.
      return "Action failed."
189
190
def DoStopNow(username,jobid):
   """Touch the file DO_STOP_NOW in the temporary directory of a job.

   The file is cmd_tmpdir/<username>.<part of jobid after the first
   dot>/DO_STOP_NOW.
   NOTE(review): presumably the running job checks for this file and stops;
   that is not visible in this script.

   username -- user name, first part of the job directory name
   jobid    -- job id; only the part after its first dot is used

   Returns None on success (the caller prints this as "None") and the string
   "Action failed." if the touch command fails.
   """

   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # Error messages of touch are discarded; the with-statement closes the
      # handle again instead of leaking it.
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
202
def DoRestartNow(username,jobid):
   """Touch the file DO_RESTART_NOW in the temporary directory of a job.

   The file is cmd_tmpdir/<username>.<part of jobid after the first
   dot>/DO_RESTART_NOW.
   NOTE(review): presumably the running job checks for this file and
   initiates a restart; that is not visible in this script.

   username -- user name, first part of the job directory name
   jobid    -- job id; only the part after its first dot is used

   Returns None on success (the caller prints this as "None") and the string
   "Action failed." if the touch command fails.
   """

   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # Error messages of touch are discarded; the with-statement closes the
      # handle again instead of leaking it.
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
214
def GetPARIN(username,jobid):
   """Return the content of the file PARIN of a job.

   The file is read with cat from cmd_tmpdir/<username>.<part of jobid after
   the first dot>/PARIN.

   username -- user name, first part of the job directory name
   jobid    -- job id; only the part after its first dot is used

   Returns the file content, or the string "Action failed." if the file
   could not be read.
   """

   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # Error messages of cat are discarded; the with-statement closes the
      # handle again instead of leaking it.
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
228
def GetRC(username,jobid):
   """Return the content of the file RUN_CONTROL of a job.

   The file is read with cat from cmd_tmpdir/<username>.<part of jobid after
   the first dot>/RUN_CONTROL.

   username -- user name, first part of the job directory name
   jobid    -- job id; only the part after its first dot is used

   Returns the file content, or the string "Action failed." if the file
   could not be read.
   """

   cmd_rc = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # Error messages of cat are discarded; the with-statement closes the
      # handle again instead of leaking it.
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_rc, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
242
# START OF MAIN
# Dispatch on the requested action. "queue" gets the user name in data;
# "check", "cancel" and "start" get the job id in data; the remaining actions
# get the user name in data and the job id in data2.
# Each print call has exactly one argument, so its output is the same as that
# of the print statement.
if ( action in ("stop", "restart", "parin", "rc") and len(sys.argv) <= 3 ):
   # Without the second data argument data2 was never read from the command
   # line; report this instead of failing with a traceback.
   print("Error. Action " + action + " requires a job id as additional argument.")
elif ( action == "queue" ):
   ReadQueue(data)
elif ( action == "check" ):
   print(CheckJob(data))
elif ( action == "cancel" ):
   print(CancelJob(data))
elif ( action == "start" ):
   print(GetStartTime(data))
elif ( action == "stop" ):
   print(DoStopNow(data,data2))
elif ( action == "restart" ):
   print(DoRestartNow(data,data2))
elif ( action == "parin" ):
   print(GetPARIN(data,data2))
elif ( action == "rc" ):
   print(GetRC(data,data2))
else:
   print("Error. Action " + action + " unknown.")
262
Note: See TracBrowser for help on using the repository browser.