source: palm/trunk/SCRIPTS/palm_wdd @ 3424

Last change on this file since 3424 was 2825, checked in by maronga, 7 years ago

adjustments in gui tools

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.9 KB
RevLine 
[2825]1#!/usr/bin/env python
[1611]2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
[2696]4# This file is part of the PALM model system.
[1611]5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
[2718]17# Copyright 1997-2018  Leibniz Universitaet Hannover
[1611]18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
[2416]23#
[1611]24# Former revisions:
25# -----------------
26# $Id: palm_wdd 2825 2018-02-20 21:48:27Z gronemeier $
[2825]27# Modified header
28#
29# 2718 2018-01-02 08:49:38Z maronga
[2716]30# Corrected "Former revisions" section
31#
32# 2696 2017-12-14 17:12:51Z kanani
33# Change in file header (GPL part)
34#
35# 2421 2017-09-07 10:36:34Z maronga
[2421]36# Fixed display of job progress.
37#
38# 2416 2017-09-06 14:28:14Z maronga
[2416]39# Adapted for palmrun
40#
41# 1619 2015-07-13 06:53:19Z maronga
[1611]42#
[1619]43# 1618 2015-07-13 06:52:15Z maronga
44# Added steering via configuration file, to be placed in the home directory of the
45# remote host to be monitored.
46#
[1614]47# 1613 2015-07-08 14:53:29Z maronga
48# Bugfix: tooltip for queuing name did not show up on first update.
49# New: added context menu for showing the parameter file and the run control
50# output
51#
[1612]52# 1611 2015-07-07 12:23:22Z maronga
53# Initial revision
54#
[1611]55#
56# Description:
57# ------------
58# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
59# by the user. The watchdog server requires Python 2.7 or higher installed on
60# the host to be monitored.
61#
62# Instructions:
63# -------------
64# 1) Modify the header section of palm_wd
65# 2) Move .wd.olddata and .wd.newdata to your palm directory
66#    (e.g. /home/user/current_version/.wd.newdata etc.)
67# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
68#    respective hosts
69# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
70#
71# To do:
72# ------
73# 1) Add "Options", "Help" and "Manual"
74# 2) Move user settings to a configuration file
75#------------------------------------------------------------------------------!
76
[1618]77import ConfigParser
[1611]78import os
[2416]79import pwd
[1618]80from subprocess import check_output
[1611]81import sys
82
[2416]83
# Read configuration file
# The file name is relative, i.e. it is resolved against the current working
# directory of the process. Without it none of the commands below are known.
if not os.path.exists('.wdd.config'):
    print("Error. No configuration file .wdd.config found.")
    raise SystemExit

config = ConfigParser.RawConfigParser()
config.read('.wdd.config')

# Shell command fragments taken from the [Settings] section. Double quotes at
# the start and end of a value are stripped.
cmd_readqueue      = config.get('Settings', 'readqueue').strip('"')
cmd_tmpdir         = config.get('Settings', 'tmpdir').strip('"')
cmd_canceljob      = config.get('Settings', 'canceljob').strip('"')
cmd_checkjob       = config.get('Settings', 'checkjob').strip('"')
cmd_realname_grep  = config.get('Settings', 'realname_grep').strip('"')
cmd_starttime      = config.get('Settings', 'starttime').strip('"')
cmd_starttime_grep = config.get('Settings', 'starttime_grep').strip('"')

# Both positional arguments are required. A missing one is reported with a
# short message on stderr instead of an IndexError traceback; stdout stays
# empty and the exit status stays non-zero, as it was with the traceback.
if len(sys.argv) < 3:
    sys.stderr.write("Error. Usage: " + sys.argv[0] + " <action> <data>\n")
    raise SystemExit(1)

action = str(sys.argv[1])
data   = str(sys.argv[2])

# Look up the login name of the calling user once. It is appended to the queue
# command as an additional blank-separated argument, and to the temporary
# directory path directly (no separator is inserted).
_login_name   = pwd.getpwuid(os.getuid())[0]
cmd_readqueue = cmd_readqueue + " " + _login_name
cmd_tmpdir    = cmd_tmpdir + _login_name
[1611]107
# reading queuing system
def ReadQueue(username):
   """Print one status line for every job reported by the queue query.

   The queue is read by running ``cmd_readqueue`` in a shell. Every output
   line is split at blanks; field 0 is used as the job id, field 2 as the job
   state, and fields 3 and 4 are passed through to the output unchanged.

   For each job the "real" job name is taken from the output of
   ``cmd_checkjob <id> | grep cmd_realname_grep`` (second blank-separated
   token), and the start time from ``GetStartTime``. For jobs whose state is
   "Running" the progress value is read from the file ``PROGRESS`` in the
   job's temporary directory (fourth blank-separated token of its second
   line); all other jobs report a progress of "0".

   Output format (blank separated), one line per job:
      <realname> <id> <field 3> <state> <progress> <field 4> <starttime>

   Parameters:
      username -- not used by this function; the queue command held in
                  ``cmd_readqueue`` is used as is. The parameter is kept so
                  that existing callers keep working.

   Returns:
      None. All results are written to stdout.
   """

   # collect queuing information; a failing query is treated as an empty queue
   try:
      job_list = check_output(cmd_readqueue, shell=True).splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split at blanks and drop the empty strings caused by repeated blanks
      fields = [field for field in line.split(" ") if field]

      # fields 0, 2, 3 and 4 are needed below; lines that do not provide them
      # (blank lines, headers, ...) are skipped instead of raising IndexError
      if len(fields) < 5:
         continue

      job_id    = fields[0]
      job_state = fields[2]

      # retrieve real job name for all jobs
      cmd_realname = cmd_checkjob + " " + job_id + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine progress
      job_progress = "0"
      if job_state == "Running":
         cmd_progress = "cat " + cmd_tmpdir + "/" + job_realname + "/PROGRESS"
         try:
            # error output of cat is discarded; the handle is closed again
            # as soon as the command has finished
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull)
            job_progress = out.splitlines()[1].split(" ")[3]
         except Exception:
            # missing or incomplete PROGRESS file: report no progress
            job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(job_id)
      print(job_realname + " " + job_id + " " + fields[3] + " " + job_state
            + " " + job_progress + " " + fields[4] + " " + job_starttime)
160
161
# check details of specific job
def CheckJob(jobid):
   """Return the output of the job-check command for one job.

   Runs ``cmd_checkjob`` followed by a blank and ``jobid`` in a shell.

   Parameters:
      jobid -- string appended to the check command as its argument.

   Returns:
      The standard output of the command, or the text
      "No details available." if the command could not be run or exited
      with a non-zero status.
   """

   try:
      return check_output(cmd_checkjob + " " + jobid, shell=True)
   except Exception:
      # best effort: any failure is reported as text, but KeyboardInterrupt
      # and SystemExit are no longer swallowed
      return "No details available."
174
175
# cancel a specific job
def CancelJob(jobid):
   """Run the job-cancel command for one job and return its output.

   Runs ``cmd_canceljob`` followed by a blank and ``jobid`` in a shell.

   Parameters:
      jobid -- string appended to the cancel command as its argument.

   Returns:
      The standard output of the command, or the text "Action failed." if
      the command could not be run or exited with a non-zero status.
   """

   try:
      return check_output(cmd_canceljob + " " + jobid, shell=True)
   except Exception:
      # best effort: any failure is reported as text, but KeyboardInterrupt
      # and SystemExit are no longer swallowed
      return "Action failed."
188
189
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the start time reported by the start-time command for one job.

   Runs ``cmd_starttime <jobid>`` in a shell and filters its output with
   ``grep "<cmd_starttime_grep>"``. The sixth whitespace-separated token of
   the filtered output is returned.

   Parameters:
      jobid -- string appended to the start-time command as its argument.

   Returns:
      The extracted token, or the text "Action failed." if the command
      failed, grep found no match, or the output has fewer than six tokens.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      return check_output(cmd_starttime_tmp, shell=True).split()[5]
   except Exception:
      # best effort: any failure is reported as text, but KeyboardInterrupt
      # and SystemExit are no longer swallowed
      return "Action failed."
202
203
def DoStopNow(jobid):
   """Create the file DO_STOP_NOW in a job's temporary directory.

   The file is created with ``touch <cmd_tmpdir>/<jobid>/DO_STOP_NOW``,
   run in a shell with its error output discarded.

   Parameters:
      jobid -- name of the sub-directory below ``cmd_tmpdir`` in which the
               file is created.

   Returns:
      None on success, the text "Action failed." if the command could not
      be run or exited with a non-zero status.
   """

   cmd_dostop = "touch " + cmd_tmpdir + "/" + jobid + "/DO_STOP_NOW"
   try:
      # the devnull handle is closed again as soon as the command finished
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): success has always been signalled by None, which the main
   # section prints as the text "None"; kept unchanged for compatibility.
   return None
215
def DoRestartNow(jobid):
   """Create the file DO_RESTART_NOW in a job's temporary directory.

   The file is created with ``touch <cmd_tmpdir>/<jobid>/DO_RESTART_NOW``,
   run in a shell with its error output discarded.

   Parameters:
      jobid -- name of the sub-directory below ``cmd_tmpdir`` in which the
               file is created.

   Returns:
      None on success, the text "Action failed." if the command could not
      be run or exited with a non-zero status.
   """

   cmd_dorestart = "touch " + cmd_tmpdir + "/" + jobid + "/DO_RESTART_NOW"
   try:
      # the devnull handle is closed again as soon as the command finished
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # NOTE(review): success has always been signalled by None, which the main
   # section prints as the text "None"; kept unchanged for compatibility.
   return None
227
def GetPARIN(jobid):
   """Return the content of the file PARIN in a job's temporary directory.

   The file is read with ``cat <cmd_tmpdir>/<jobid>/PARIN``, run in a shell
   with its error output discarded.

   Parameters:
      jobid -- name of the sub-directory below ``cmd_tmpdir`` that holds
               the file.

   Returns:
      The file content on success. On failure the text "Action failed."
      immediately followed (without separator) by the command that was
      tried.
   """

   cmd_getparin = "cat " + cmd_tmpdir + "/" + jobid + "/PARIN"
   try:
      # the devnull handle is closed again as soon as the command finished
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_getparin, shell=True, stderr=devnull)
   except Exception:
      # the failure text is kept exactly as before, including the appended
      # command string
      return "Action failed." + cmd_getparin
241
def GetRC(jobid):
   """Return the content of the file RUN_CONTROL in a job's temporary directory.

   The file is read with ``cat <cmd_tmpdir>/<jobid>/RUN_CONTROL``, run in a
   shell with its error output discarded.

   Parameters:
      jobid -- name of the sub-directory below ``cmd_tmpdir`` that holds
               the file.

   Returns:
      The file content on success, the text "Action failed." if the command
      could not be run or exited with a non-zero status.
   """

   cmd_getrc = "cat " + cmd_tmpdir + "/" + jobid + "/RUN_CONTROL"
   try:
      # the devnull handle is closed again as soon as the command finished
      with open(os.devnull, 'w') as devnull:
         return check_output(cmd_getrc, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."
255
# START OF MAIN
# Actions whose handler returns a value that is printed afterwards. Each
# handler is called with the second command line argument.
_printing_actions = {
   "check":   CheckJob,
   "cancel":  CancelJob,
   "start":   GetStartTime,
   "stop":    DoStopNow,
   "restart": DoRestartNow,
   "parin":   GetPARIN,
   "rc":      GetRC,
}

if action == "queue":
   # ReadQueue writes its result lines itself, so nothing is printed here
   ReadQueue(data)
elif action in _printing_actions:
   print(_printing_actions[action](data))
else:
   print("Error. Action " + action + " unknown.")
[1611]275
Note: See TracBrowser for help on using the repository browser.