source: palm/trunk/SCRIPTS/palm_wdd @ 2062

Last change on this file since 2062 was 2062, checked in by maronga, 5 years ago

last commit documented

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.9 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 2062 2016-11-10 14:54:13Z maronga $
27#
28# Bugfix: return proper error messages when .wdd.config is not found and for
29# wrong actions
30#
31# 1618 2015-07-13 06:52:15Z maronga
32# Added steering via configuration file, to be placed in the home directory of the
33# remote host to be monitored.
34#
35# 1613 2015-07-08 14:53:29Z maronga
36# Bugfix: tooltip for queuing name did not show up on first update.
37# New: added context menu for showing the parameter file and the run control
38# output
39#
40# 1611 2015-07-07 12:23:22Z maronga
41# Initial revision
42#
43#
44# Description:
45# ------------
46# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
47# by the user. The watchdog server requires Python 2.7 or higher to be installed
48# on the host to be monitored.
49#
50# Instructions:
51# -------------
52# 1) Modify the header section of palm_wd
53# 2) Move .wd.olddata and .wd.newdata to your palm directory
54#    (e.g. /home/user/current_version/.wd.newdata etc.)
55# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
56#    respective hosts
57# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
58#
59# To do:
60# ------
61# 1) Add "Options", "Help" and "Manual"
62# 2) Move user settings to a configuration file
63#------------------------------------------------------------------------------!
64
65import ConfigParser
66import os
67from subprocess import check_output
68import sys
69import socket
70
# Read configuration file.
# The file .wdd.config is looked up relative to the current working directory.
# If it is missing, report this on stderr and stop. SystemExit is raised
# without an argument here, exactly as before, so the exit status of this path
# is unchanged for the calling client.
if not os.path.exists('.wdd.config'):
    sys.stderr.write("No configuration file .wdd.config found on remote host %s." % socket.gethostname())
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')


def _read_setting(option):
    # Return the value of 'option' from the [Settings] section of .wdd.config
    # with any enclosing double quotes removed.
    return config.get('Settings', option).strip('"')


cmd_readqueue      = _read_setting('readqueue')
cmd_tmpdir         = _read_setting('tmpdir')
cmd_canceljob      = _read_setting('canceljob')
cmd_checkjob       = _read_setting('checkjob')
cmd_realname_grep  = _read_setting('realname_grep')
cmd_starttime      = _read_setting('starttime')
cmd_starttime_grep = _read_setting('starttime_grep')


# Command line arguments: <action> <data> [<data2>]
# Both <action> and <data> are required by everything below. Without them the
# script used to die with an IndexError traceback; report the problem instead.
if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s <action> <data> [<data2>]" % sys.argv[0])
    raise SystemExit(1)

action = str(sys.argv[1])
data   = str(sys.argv[2])

# <data2> is optional. Always define it, so that actions which use it do not
# fail with a NameError when it was not given.
if len(sys.argv) > 3:
    data2 = str(sys.argv[3])
else:
    data2 = ""


# <data> is appended to the queue command (separated by a blank) and directly
# to the temporary directory path.
cmd_readqueue = cmd_readqueue + " " + data
cmd_tmpdir    = cmd_tmpdir + data
97
98# reading queuing system
def ReadQueue(username):
   """Print one status line for every job reported by the queuing system.

   The module-level command cmd_readqueue is run through the shell. Every
   line of its output is split at blanks, and for each line the following
   blank-separated record is printed:

      <real name> <field 0> <field 3> <field 2> <progress> <field 4> <start time>

   <field n> is the n-th (zero-based) non-empty field of the queue line.
   Field 0 is used as the job id and field 2 is compared against "Running".

   <real name> is the second blank-separated token of the output of
   cmd_checkjob for the job, filtered with grep for cmd_realname_grep, or
   "error" if it cannot be determined.

   <progress> is only determined for jobs whose field 2 equals "Running". It
   is the second blank-separated token of the second line of the job's
   PROGRESS file below cmd_tmpdir. In all other cases, and if the file cannot
   be read, it is "0".

   <start time> is the return value of GetStartTime() for the job.

   Arguments:
      username -- used to build the name of the job's temporary directory

   Nothing is returned. A failing queue command is treated like an empty
   queue. Lines with fewer than five fields (e.g. blank lines) are skipped,
   because they do not carry all the fields used above.
   """

   # collect queuing information, do nothing for empty results list
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # Split at blanks and drop the empty strings caused by runs of blanks
      fields = [field for field in line.split(" ") if field]

      # Indexing a short line would raise an IndexError and abort the
      # listing for all remaining jobs
      if len(fields) < 5:
         continue

      jobid = fields[0]

      # retrieve real job name
      cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if fields[2] == "Running":
         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
         try:
            # Discard the error output of cat; the with statement closes the
            # handle again (it used to be leaked for every running job)
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # output the job data
      job_starttime = GetStartTime(jobid)
      print(" ".join([job_realname, jobid, fields[3], fields[2], job_progress, fields[4], job_starttime]))
150
151
152# check details of specific job
def CheckJob(jobid):
   """Return the details of a specific job.

   The job id is appended, separated by a blank, to the module-level command
   cmd_checkjob and the result is run through the shell.

   Arguments:
      jobid -- id of the job, as a string

   Returns the output of the command, or "No details available." if running
   the command fails.
   """

   query = " ".join([cmd_checkjob, jobid])

   try:
      return check_output(query, shell=True)
   except:
      return "No details available."
164
165
166# cancel a specific job
def CancelJob(jobid):
   """Cancel a specific job.

   The job id is appended, separated by a blank, to the module-level command
   cmd_canceljob and the result is run through the shell.

   Arguments:
      jobid -- id of the job, as a string

   Returns the output of the command, or "Action failed." if running the
   command fails.
   """

   request = " ".join([cmd_canceljob, jobid])

   try:
      result = check_output(request, shell=True)
   except:
      result = "Action failed."

   return result
178
179
180# retrieve estimated start time of job
def GetStartTime(jobid):
   """Retrieve the start time of a job as reported by cmd_starttime.

   Runs the module-level command cmd_starttime for the job through the shell
   and filters its output with grep for cmd_starttime_grep.

   Arguments:
      jobid -- id of the job, as a string

   Returns the sixth whitespace-separated token of the filtered output, or
   "Action failed." if the command fails or yields fewer than six tokens.
   """

   query = cmd_starttime + " " + jobid + '|grep "' + cmd_starttime_grep + '"'

   try:
      tokens = check_output(query, shell=True).split()
      return tokens[5]
   except:
      return "Action failed."
192
193
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The file is created with "touch" in the directory

      <cmd_tmpdir>/<username>.<part of jobid after its first ".">

   Arguments:
      username -- used to build the name of the job's temporary directory
      jobid    -- id of the job, as a string

   Returns None if the file was created and "Action failed." otherwise.
   """

   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # Discard the error output of touch; the with statement closes the
      # handle again (it used to be leaked)
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
205
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The file is created with "touch" in the directory

      <cmd_tmpdir>/<username>.<part of jobid after its first ".">

   Arguments:
      username -- used to build the name of the job's temporary directory
      jobid    -- id of the job, as a string

   Returns None if the file was created and "Action failed." otherwise.
   """

   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # Discard the error output of touch; the with statement closes the
      # handle again (it used to be leaked)
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   return None
217
def GetPARIN(username,jobid):
   """Return the content of the file PARIN of a job.

   The file is read with "cat" from the directory

      <cmd_tmpdir>/<username>.<part of jobid after its first ".">

   Arguments:
      username -- used to build the name of the job's temporary directory
      jobid    -- id of the job, as a string

   Returns the content of the file, or "Action failed." if it cannot be read.
   """

   cmd_parin = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PARIN"
   try:
      # Discard the error output of cat; the with statement closes the handle
      # again (it used to be leaked)
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      return_message = "Action failed."

   return return_message
231
def GetRC(username,jobid):
   """Return the content of the file RUN_CONTROL of a job.

   The file is read with "cat" from the directory

      <cmd_tmpdir>/<username>.<part of jobid after its first ".">

   Arguments:
      username -- used to build the name of the job's temporary directory
      jobid    -- id of the job, as a string

   Returns the content of the file, or "Action failed." if it cannot be read.
   """

   cmd_runcontrol = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/RUN_CONTROL"
   try:
      # Discard the error output of cat; the with statement closes the handle
      # again (it used to be leaked)
      with open(os.devnull, 'w') as devnull:
         return_message = check_output(cmd_runcontrol, shell=True, stderr=devnull)
   except Exception:
      return_message = "Action failed."

   return return_message
245
# START OF MAIN
# Run the requested action. "queue" prints its result itself; for all other
# actions the return value of the respective function is printed. The
# single-argument form print(...) gives the same output as the print statement.
if action == "queue":
   ReadQueue(data)
elif action == "check":
   print(CheckJob(data))
elif action == "cancel":
   print(CancelJob(data))
elif action == "start":
   print(GetStartTime(data))
elif action == "stop":
   print(DoStopNow(data,data2))
elif action == "restart":
   print(DoRestartNow(data,data2))
elif action == "parin":
   print(GetPARIN(data,data2))
elif action == "rc":
   print(GetRC(data,data2))
else:
   # The name "hostname" used here before was never defined, so an unknown
   # action ended in a NameError instead of this message
   sys.stderr.write("Unknown action on remote host %s." % socket.gethostname())
265
Note: See TracBrowser for help on using the repository browser.