source: palm/trunk/SCRIPTS/palm_wdd @ 2798

Last change on this file since 2798 was 2718, checked in by maronga, 7 years ago

deleting of deprecated files; headers updated where needed

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.8 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of the PALM model system.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2018  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 2718 2018-01-02 08:49:38Z suehring $
27# Corrected "Former revisions" section
28#
29# 2696 2017-12-14 17:12:51Z kanani
30# Change in file header (GPL part)
31#
32# 2421 2017-09-07 10:36:34Z maronga
33# Fixed display of job progress.
34#
35# 2416 2017-09-06 14:28:14Z maronga
36# Adapted for palmrun
37#
38# 1619 2015-07-13 06:53:19Z maronga
39#
40# 1618 2015-07-13 06:52:15Z maronga
41# Added steering via configuration file, to be placed in the home directory of the
42# remote host to be monitored.
43#
44# 1613 2015-07-08 14:53:29Z maronga
45# Bugfix: tooltip for queuing name did not show up on first update.
46# New: added context menu for showing the parameter file and the run control
47# output
48#
49# 1611 2015-07-07 12:23:22Z maronga
50# Initial revision
51#
52#
53# Description:
54# ------------
55# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
56# by the user. The watchdog server requires python 2.7 or higher installed on
57# the host to be monitored.
58#
59# Instructions:
60# -------------
61# 1) Modify the header section of palm_wd
62# 2) Move .wd.olddata and .wd.newdata to your palm directory
63#    (e.g. /home/user/current_version/.wd.newdata etc.)
64# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
65#    respective hosts
66# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
67#
68# To do:
69# ------
70# 1) Add "Options", "Help" and "Manual"
71# 2) Move user settings to a configuration file
72#------------------------------------------------------------------------------!
73
74import ConfigParser
75import os
76import pwd
77from subprocess import check_output
78import sys
79
80
81# Read configuration file
82# First check if the configuration file exists
# Bail out early when the configuration file is missing from the current
# working directory; every command used below is defined in it.
if not os.path.exists('.wdd.config'):
    print("Error. No configuration file .wdd.config found.")
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')


def _read_setting(option):
    """Return option from the [Settings] section without enclosing quotes."""
    return config.get('Settings', option).strip('"')


cmd_readqueue      = _read_setting('readqueue')
cmd_tmpdir         = _read_setting('tmpdir')
cmd_canceljob      = _read_setting('canceljob')
cmd_checkjob       = _read_setting('checkjob')
cmd_realname_grep  = _read_setting('realname_grep')
cmd_starttime      = _read_setting('starttime')
cmd_starttime_grep = _read_setting('starttime_grep')


# Both positional arguments are required. Report a missing one explicitly
# instead of failing with an IndexError traceback (exit status stays 1).
if len(sys.argv) < 3:
    print("Error. Usage: palm_wdd <action> <data>")
    raise SystemExit(1)

action   = str(sys.argv[1])
data     = str(sys.argv[2])

# Login name of the user running this script. It is passed as an argument to
# the queue command and appended directly (without any separator) to the
# configured temporary directory.
_login_name = pwd.getpwuid(os.getuid())[0]

cmd_readqueue = cmd_readqueue + " " + _login_name
cmd_tmpdir    = cmd_tmpdir + _login_name
104
105# reading queuing system
def ReadQueue(username):
   """Print one status line for every job reported by the queuing system.

   The queue is queried with the module-level command cmd_readqueue. Each
   line of its output is split on blanks. For every job the real job name,
   the progress (running jobs only) and the estimated start time are looked
   up with additional shell commands.

   username: not used by this function (cmd_readqueue already contains the
             login name); kept so that existing callers keep working.

   Output (stdout), blank separated, one line per job:
      <real name> <field 0> <field 3> <field 2> <progress> <field 4> <start time>
   where <field n> is the n-th non-empty column of the queue output and
   <field 0> is used as the job id.

   Returns None.
   """

   # collect queuing information; a failing queue command is treated like an
   # empty queue (best effort)
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split the line into its non-empty, blank separated columns
      job_data = [column for column in line.split(" ") if column]

      # Columns 0, 2, 3 and 4 are required below. Skip blank or otherwise
      # incomplete lines instead of aborting the whole listing with an
      # IndexError.
      if len(job_data) < 5:
         continue

      job_id = job_data[0]
      job_state = job_data[2]

      # retrieve real job name for all jobs
      cmd_realname = cmd_checkjob + " " + job_id + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if job_state == "Running":

         # The value is taken from the 4th blank separated token on the 2nd
         # line of the job's PROGRESS file; any failure leaves it at "0".
         cmd_progress = "cat " + cmd_tmpdir + "/" + job_realname + "/PROGRESS"
         try:
            # close the devnull handle again instead of leaking one per job
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            job_progress = out.splitlines()[1].split(" ")[3]
         except Exception:
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(job_id)
      print(job_realname + " " + job_id + " " + job_data[3] + " " + job_state + " " + job_progress + " "  + job_data[4] + " " + job_starttime)
157
158
159# check details of specific job
def CheckJob(jobid):
   """Return the output of the configured check-job command for one job.

   jobid: job identifier, appended to cmd_checkjob as command-line argument.

   Returns the unmodified command output, or "No details available." if the
   command could not be run or exited with a non-zero status.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   # Catch Exception rather than everything, so that KeyboardInterrupt and
   # SystemExit are not turned into the fallback message.
   try:
      return check_output(cmd_checkjob_tmp, shell=True)
   except Exception:
      return "No details available."
171
172
173# cancel a specific job
def CancelJob(jobid):
   """Run the configured cancel command for one job and return its output.

   jobid: job identifier, appended to cmd_canceljob as command-line argument.

   Returns the unmodified command output, or "Action failed." if the command
   could not be run or exited with a non-zero status.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   # Catch Exception rather than everything, so that KeyboardInterrupt and
   # SystemExit are not reported as a failed cancel.
   try:
      return check_output(cmd_canceljob_tmp, shell=True)
   except Exception:
      return "Action failed."
185
186
187# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the estimated start time of a job as reported by the scheduler.

   The configured start-time command is run for the job and its output is
   filtered with grep for the configured pattern (cmd_starttime_grep).

   jobid: job identifier, appended to cmd_starttime as command-line argument.

   Returns the 6th whitespace separated token of the filtered output, or
   "Action failed." if the command fails, grep finds no match, or the output
   has fewer than six tokens.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   # Catch Exception rather than everything, so that KeyboardInterrupt and
   # SystemExit are not swallowed.
   try:
      out = check_output(cmd_starttime_tmp, shell=True)
      return out.split()[5]
   except Exception:
      return "Action failed."
199
200
def DoStopNow(jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   jobid: name of the job's sub-directory below cmd_tmpdir.

   Returns None on success and "Action failed." if the file could not be
   created.
   """

   # touch the flag file; error output of the command is discarded
   cmd_dostop = "touch " + cmd_tmpdir + "/" + jobid + "/DO_STOP_NOW"
   try:
      # the with-statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): the main program prints this return value, so a
   # successful call prints "None" - kept unchanged for compatibility.
   return None
212
def DoRestartNow(jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   jobid: name of the job's sub-directory below cmd_tmpdir.

   Returns None on success and "Action failed." if the file could not be
   created.
   """

   # touch the flag file; error output of the command is discarded
   cmd_dorestart = "touch " + cmd_tmpdir + "/" + jobid + "/DO_RESTART_NOW"
   try:
      # the with-statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): the main program prints this return value, so a
   # successful call prints "None" - kept unchanged for compatibility.
   return None
224
def GetPARIN(jobid):
   """Return the content of the file PARIN of a job.

   jobid: name of the job's sub-directory below cmd_tmpdir.

   Returns the file content as delivered by "cat". On failure the string
   "Action failed." immediately followed by the attempted command is
   returned.
   """

   # read the file via the shell; error output of the command is discarded
   cmd_parin = "cat " + cmd_tmpdir + "/" + jobid + "/PARIN"
   try:
      # the with-statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_parin, shell=True, stderr=devnull)
   except Exception:
      # NOTE(review): the command is appended without a separating blank,
      # unlike the plain message of GetRC - kept unchanged for compatibility.
      return "Action failed."   + cmd_parin
238
def GetRC(jobid):
   """Return the content of the file RUN_CONTROL of a job.

   jobid: name of the job's sub-directory below cmd_tmpdir.

   Returns the file content as delivered by "cat", or "Action failed." if
   the file could not be read.
   """

   # read the file via the shell; error output of the command is discarded
   cmd_rc = "cat " + cmd_tmpdir + "/" + jobid + "/RUN_CONTROL"
   try:
      # the with-statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_rc, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
252
253# START OF MAIN
# Actions whose handler returns a value that is printed as the reply.
# "queue" is handled separately because ReadQueue prints its lines itself.
_reply_handlers = {
   "check":   CheckJob,
   "cancel":  CancelJob,
   "start":   GetStartTime,
   "stop":    DoStopNow,
   "restart": DoRestartNow,
   "parin":   GetPARIN,
   "rc":      GetRC,
}

if action == "queue":
   ReadQueue(data)
elif action in _reply_handlers:
   print(_reply_handlers[action](data))
else:
   print("Error. Action " + action + " unknown.")
272
Note: See TracBrowser for help on using the repository browser.