source: palm/trunk/SCRIPTS/palm_wdd @ 4875

Last change on this file since 4875 was 4843, checked in by raasch, 4 years ago

local namelist parameter added to switch off the module although the respective module namelist appears in the namelist file, further copyright updates

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.9 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of the PALM model system.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2021  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 4843 2021-01-15 15:22:11Z scharf $
27# Modified header
28#
29# 2718 2018-01-02 08:49:38Z maronga
30# Corrected "Former revisions" section
31#
32# 2696 2017-12-14 17:12:51Z kanani
33# Change in file header (GPL part)
34#
35# 2421 2017-09-07 10:36:34Z maronga
36# Fixed display of job progress.
37#
38# 2416 2017-09-06 14:28:14Z maronga
39# Adapted for palmrun
40#
41# 1619 2015-07-13 06:53:19Z maronga
42#
43# 1618 2015-07-13 06:52:15Z maronga
44# Added steering via configuration file, to be placed in the home directory of the
45# remote host to be monitored.
46#
47# 1613 2015-07-08 14:53:29Z maronga
48# Bugfix: tooltip for queuing name did not show up on first update.
49# New: added context menu for showing the parameter file and the run control
50# output
51#
52# 1611 2015-07-07 12:23:22Z maronga
53# Initial revision
54#
55#
56# Description:
57# ------------
58# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
59# by the user. The watchdog server requires Python 2.7 (it uses Python 2 print
60# statements and the ConfigParser module) installed on the host to be monitored.
61#
62# Instructions:
63# -------------
64# 1) Modify the header section of palm_wd
65# 2) Move .wd.olddata and .wd.newdata to your palm directory
66#    (e.g. /home/user/current_version/.wd.newdata etc.)
67# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
68#    respective hosts
69# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
70#
71# To do:
72# ------
73# 1) Add "Options", "Help" and "Manual"
74# 2) Move user settings to a configuration file
75#------------------------------------------------------------------------------!
76
77import ConfigParser
78import os
79import pwd
80from subprocess import check_output
81import sys
82
83
# Read configuration file
# First check if the configuration file exists. The path is relative, so the
# script has to be started from the directory that contains .wdd.config.
if not os.path.exists('.wdd.config'):
    print("Error. No configuration file .wdd.config found.")
    # A bare SystemExit ends the script with exit status 0; this is kept
    # unchanged so that callers only see the message on stdout.
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# All commands are taken from the [Settings] section. Double quotes around a
# configured value are stripped.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')


# Both command line arguments are mandatory. Report a missing argument
# explicitly instead of failing with an IndexError traceback; the exit status
# is non-zero, as it was for the uncaught exception.
if len(sys.argv) < 3:
    print("Error. Usage: " + os.path.basename(sys.argv[0]) + " ACTION DATA")
    raise SystemExit(1)

action   = sys.argv[1]
data     = sys.argv[2]

# The login name of the user running this script is appended to the queue
# command (as a separate argument) and to the temporary directory path
# (directly, without a separator).
_current_user = pwd.getpwuid(os.getuid())[0]
cmd_readqueue = cmd_readqueue + " " + _current_user
cmd_tmpdir    = cmd_tmpdir + _current_user
107
# reading queuing system
def ReadQueue(username):
    """Print one status line for every job reported by the queue command.

    The module-level ``cmd_readqueue`` command is run through the shell and
    each output line is split at single blanks (empty fields are dropped).
    Field 0 is used as the job id and field 2 as the job state; fields 3 and 4
    are passed through unchanged (their meaning depends on the configured
    queue command).

    For each job the printed line is, separated by single blanks:
    real job name, job id, field 3, job state, progress, field 4, start time.

    Lines with fewer than five fields are skipped, because they cannot be
    reported in this format.

    Args:
        username: Not used. The user name is already part of
            ``cmd_readqueue``; the parameter is kept for compatibility.

    Returns:
        None. The result is written to stdout.
    """
    try:
        job_list = check_output(cmd_readqueue, shell=True).splitlines()
    except Exception:
        # Best effort: if the queue command fails (check_output raises on a
        # non-zero exit status), report no jobs at all.
        job_list = []

    for line in job_list:
        fields = [field for field in line.split(" ") if field]
        if len(fields) < 5:
            continue

        job_id = fields[0]
        job_state = fields[2]
        job_realname = _job_real_name(job_id)

        # Progress is only looked up for running jobs.
        if job_state == "Running":
            job_progress = _job_progress(job_realname)
        else:
            job_progress = "0"

        job_starttime = GetStartTime(job_id)
        print(" ".join([job_realname, job_id, fields[3], job_state,
                        job_progress, fields[4], job_starttime]))


def _job_real_name(job_id):
    """Return the real name of job *job_id*, or "error" if it is unavailable.

    The name is the second blank-separated token of the check-job output line
    selected by ``cmd_realname_grep``.
    """
    cmd_realname = cmd_checkjob + " " + job_id + "|grep " + cmd_realname_grep
    try:
        out = check_output(cmd_realname, shell=True)
        return out.split(" ")[1].rstrip()
    except Exception:
        return "error"


def _job_progress(job_realname):
    """Return the progress value from the job's PROGRESS file, or "0".

    The value is the fourth blank-separated token on the second line of the
    file PROGRESS in the job directory below ``cmd_tmpdir``.
    """
    cmd_progress = "cat " + cmd_tmpdir + "/" + job_realname + "/PROGRESS"
    try:
        # Error output of cat is discarded; the handle is closed again as soon
        # as the command has finished.
        with open(os.devnull, 'w') as devnull:
            out = check_output(cmd_progress, shell=True, stderr=devnull)
        return out.splitlines()[1].split(" ")[3]
    except Exception:
        return "0"
160
161
# check details of specific job
def CheckJob(jobid):
    """Return the output of the configured check-job command for *jobid*.

    Args:
        jobid: Job identifier, appended to ``cmd_checkjob`` as an argument.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        The command output, or "No details available." if the command could
        not be run or ended with a non-zero exit status.
    """
    try:
        return check_output(cmd_checkjob + " " + jobid, shell=True)
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        return "No details available."
174
175
# cancel a specific job
def CancelJob(jobid):
    """Run the configured cancel command for *jobid* and return its output.

    Args:
        jobid: Job identifier, appended to ``cmd_canceljob`` as an argument.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        The command output, or "Action failed." if the command could not be
        run or ended with a non-zero exit status.
    """
    try:
        return check_output(cmd_canceljob + " " + jobid, shell=True)
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        return "Action failed."
188
189
# retrieve estimated start time of job
def GetStartTime(jobid):
    """Return the start time reported for *jobid*.

    The configured start-time command is run for the job and its output is
    filtered with grep for ``cmd_starttime_grep``. The sixth
    whitespace-separated token of the remaining output is returned.

    Args:
        jobid: Job identifier, appended to ``cmd_starttime`` as an argument.
            NOTE(review): the value is passed to the shell unquoted.

    Returns:
        The start time token, or "Action failed." if the command fails, grep
        finds no match, or the output has fewer than six tokens.
    """
    cmd_starttime_tmp = (cmd_starttime + " " + jobid
                         + "|grep \"" + cmd_starttime_grep + "\"")
    try:
        return check_output(cmd_starttime_tmp, shell=True).split()[5]
    except Exception:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        return "Action failed."
202
203
def DoStopNow(jobid):
    """Create the file DO_STOP_NOW in the temporary directory of a job.

    Args:
        jobid: Name of the job directory below ``cmd_tmpdir``.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        None on success (the main section then prints "None"), or
        "Action failed." if the file could not be created.
    """
    cmd_dostop = "touch " + cmd_tmpdir + "/" + jobid + "/DO_STOP_NOW"
    try:
        # Error output of touch is discarded; the handle is closed again as
        # soon as the command has finished.
        with open(os.devnull, 'w') as devnull:
            check_output(cmd_dostop, shell=True, stderr=devnull)
    except Exception:
        return "Action failed."
    return None
215
def DoRestartNow(jobid):
    """Create the file DO_RESTART_NOW in the temporary directory of a job.

    Args:
        jobid: Name of the job directory below ``cmd_tmpdir``.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        None on success (the main section then prints "None"), or
        "Action failed." if the file could not be created.
    """
    cmd_dorestart = "touch " + cmd_tmpdir + "/" + jobid + "/DO_RESTART_NOW"
    try:
        # Error output of touch is discarded; the handle is closed again as
        # soon as the command has finished.
        with open(os.devnull, 'w') as devnull:
            check_output(cmd_dorestart, shell=True, stderr=devnull)
    except Exception:
        return "Action failed."
    return None
227
def GetPARIN(jobid):
    """Return the content of the file PARIN in the temporary directory of a job.

    Args:
        jobid: Name of the job directory below ``cmd_tmpdir``.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        The file content, or "Action failed." directly followed by the cat
        command that was tried if the file could not be read.
    """
    cmd_parin = "cat " + cmd_tmpdir + "/" + jobid + "/PARIN"
    try:
        # Error output of cat is discarded; the handle is closed again as
        # soon as the command has finished.
        with open(os.devnull, 'w') as devnull:
            return check_output(cmd_parin, shell=True, stderr=devnull)
    except Exception:
        return "Action failed." + cmd_parin
241
def GetRC(jobid):
    """Return the content of the file RUN_CONTROL in the temporary directory of a job.

    Args:
        jobid: Name of the job directory below ``cmd_tmpdir``.
            NOTE(review): the value originates from the command line and is
            passed to the shell unquoted.

    Returns:
        The file content, or "Action failed." if the file could not be read.
    """
    cmd_runcontrol = "cat " + cmd_tmpdir + "/" + jobid + "/RUN_CONTROL"
    try:
        # Error output of cat is discarded; the handle is closed again as
        # soon as the command has finished.
        with open(os.devnull, 'w') as devnull:
            return check_output(cmd_runcontrol, shell=True, stderr=devnull)
    except Exception:
        return "Action failed."
255
# START OF MAIN
# Dispatch on the requested action. ReadQueue writes its result to stdout
# itself; every other handler returns a value that is printed here.
_printing_handlers = {
    "check": CheckJob,
    "cancel": CancelJob,
    "start": GetStartTime,
    "stop": DoStopNow,
    "restart": DoRestartNow,
    "parin": GetPARIN,
    "rc": GetRC,
}

if action == "queue":
    ReadQueue(data)
elif action in _printing_handlers:
    print(_printing_handlers[action](data))
else:
    print("Error. Action " + action + " unknown.")
275
Note: See TracBrowser for help on using the repository browser.