source: palm/trunk/SCRIPTS/palm_wdd @ 1612

Last change on this file since 1612 was 1612, checked in by maronga, 9 years ago

last commit documented

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 6.0 KB
RevLine 
[1611]1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22#
[1612]23#
[1611]24# Former revisions:
25# -----------------
26# $Id: palm_wdd 1612 2015-07-07 12:25:21Z maronga $
27#
[1612]28# 1611 2015-07-07 12:23:22Z maronga
29# Initial revision
30#
[1611]31#
32# Description:
33# ------------
34# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
35# by the user. The watchdog server requires Python 2.7 or higher to be installed
36# on each host to be monitored.
37#
38# Instructions:
39# -------------
40# 1) Modify the header section of palm_wd
41# 2) Move .wd.olddata and .wd.newdata to your palm directory
42#    (e.g. /home/user/current_version/.wd.newdata etc.)
43# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
44#    respective hosts
45# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
46#
47# To do:
48# ------
49# 1) Add "Options", "Help" and "Manual"
50# 2) Move user settings to a configuration file
51#------------------------------------------------------------------------------!
52
53import os
54import sys
55from subprocess import check_output
56
# START OF HEADER

# configuration for host
#
# All values are plain strings that are concatenated into shell command lines
# further below, so they have to be valid in a shell context.

# queue query; the second command line argument is appended as egrep pattern
cmd_readqueue = "showq | egrep "

# base directory below which the job directories are searched; the second
# command line argument is appended to it
cmd_tmpdir = "/gfs1/tmp/"

# command that cancels a job; called as "<cmd_canceljob> <jobid>"
cmd_canceljob = "canceljob"

# command that prints job details; called as "<cmd_checkjob> <jobid>"
cmd_checkjob = "checkjob"

# grep pattern selecting the line of the cmd_checkjob output that holds the
# real job name
cmd_realname_grep = "AName"

# command that prints the estimated start time; called as
# "<cmd_starttime> <jobid>"
cmd_starttime = "showstart"

# grep pattern selecting the line of the cmd_starttime output that holds the
# start time
cmd_starttime_grep = "start in"

# END OF HEADER
69
70
# Command line interface:  palm_wdd <action> <data> [<data2>]
#
#   action -- one of the keywords evaluated in the main section at the end of
#             this file
#   data   -- first parameter of the action (a user name or a job id, depending
#             on the action)
#   data2  -- optional second parameter (the job id for the actions that take
#             a user name as first parameter)
if ( len(sys.argv) < 3 ):
   # Report missing arguments on stderr instead of dying with an IndexError
   # traceback; stdout is kept clean because it carries the result data.
   sys.stderr.write("Usage: palm_wdd <action> <data> [<data2>]\n")
   sys.exit(1)

action = sys.argv[1]
data   = sys.argv[2]

# data2 is always defined, so actions that need it fail with their regular
# "Action failed." message rather than with a NameError when it is omitted.
if ( len(sys.argv) > 3 ):
   data2 = sys.argv[3]
else:
   data2 = ""


# NOTE: data is appended unconditionally, i.e. also for actions where it is a
# job id. Only the "queue", "stop" and "restart" actions use these two
# strings, and for those data is the user name.
cmd_readqueue = cmd_readqueue + data
cmd_tmpdir    = cmd_tmpdir + data
79
80
81
# retrieve the real name of a job
def _JobRealName(jobid):
   """Return the real name of job jobid, or "error" if it cannot be determined.

   The output of "<cmd_checkjob> <jobid>" is filtered with grep for
   cmd_realname_grep; the second blank-separated token of the result is taken
   as the job name.
   """

   cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep

   try:
      # universal_newlines=True yields a text string on Python 2 and 3
      out = check_output(cmd_realname, shell=True, universal_newlines=True)
      return out.split(" ")[1].rstrip()
   except Exception:
      # command failed, grep found nothing, or the line has no second token
      return "error"


# determine the progress of a running job
def _JobProgress(username, jobid):
   """Return the progress string of job jobid, or "0" if it is unavailable.

   The value is the second blank-separated token on the second line of the
   file <cmd_tmpdir>/<username>.<part of jobid after the first dot>/PROGRESS.
   """

   cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"

   try:
      # The with-statement closes the devnull handle again (it used to be
      # leaked once per running job). stderr is discarded because a missing
      # PROGRESS file is an expected situation.
      with open(os.devnull, 'w') as devnull:
         out = check_output(cmd_progress, shell=True, stderr=devnull,
                            universal_newlines=True)
      return out.splitlines()[1].split(" ")[1]
   except Exception:
      return "0"


# reading queuing system
def ReadQueue(username):
   """Print one status line for every job found by the queue query.

   The shell command cmd_readqueue is executed and each line of its output is
   split into blank-separated fields. For every job the following
   blank-separated record is printed to stdout:

      <real name> <field 0> <field 3> <field 2> <progress> <field 4> <start time>

   Field 0 is used as job id and field 2 as job state. Fields 3 and 4 are
   passed through unchanged (presumably processor count and remaining time of
   the showq output -- TODO confirm).

   username -- user name, used to locate the PROGRESS file of running jobs

   Nothing is returned. If the queue command fails or yields no output,
   nothing is printed.
   """

   # collect queuing information
   try:
      out = check_output(cmd_readqueue, shell=True, universal_newlines=True)
   except Exception:
      # do nothing for empty results list (grep-like filters exit with a
      # non-zero status when no line matches)
      return

   for line in out.splitlines():

      # Split at blanks and drop the empty strings produced by runs of blanks.
      # A list is built explicitly because the result is indexed below.
      fields = [field for field in line.split(" ") if field]

      # Skip lines that do not carry all fields used below instead of
      # aborting the complete listing with an IndexError.
      if ( len(fields) < 5 ):
         continue

      jobid     = fields[0]
      job_state = fields[2]

      # retrieve real job name for all jobs
      job_realname = _JobRealName(jobid)

      # for running jobs, determine progress
      if ( job_state == "Running" ):
         job_progress = _JobProgress(username, jobid)
      else:
         job_progress = "0"

      # return the job data
      job_starttime = GetStartTime(jobid)
      print(" ".join([job_realname, jobid, fields[3], job_state, job_progress, fields[4], job_starttime]))
134
135
# check details of specific job
def CheckJob(jobid):
   """Return the output of "<cmd_checkjob> <jobid>" as a string.

   jobid -- id of the job, appended verbatim to the shell command line

   If the command cannot be run or exits with a non-zero status, the string
   "No details available." is returned instead.
   """

   try:
      # universal_newlines=True yields a text string on Python 2 and 3
      return check_output(cmd_checkjob + " " + jobid, shell=True,
                          universal_newlines=True)
   except Exception:
      # "except Exception" instead of a bare except, so that
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "No details available."
148
149
# cancel a specific job
def CancelJob(jobid):
   """Run "<cmd_canceljob> <jobid>" and return its output as a string.

   jobid -- id of the job, appended verbatim to the shell command line

   If the command cannot be run or exits with a non-zero status, the string
   "Action failed." is returned instead.
   """

   try:
      # universal_newlines=True yields a text string on Python 2 and 3
      return check_output(cmd_canceljob + " " + jobid, shell=True,
                          universal_newlines=True)
   except Exception:
      # "except Exception" instead of a bare except, so that
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."
162
163
# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the estimated start time of job jobid as a string.

   The output of "<cmd_starttime> <jobid>" is filtered with grep for
   cmd_starttime_grep and the sixth whitespace-separated token of the result
   is returned.

   jobid -- id of the job, appended verbatim to the shell command line

   If the command fails, grep finds no matching line, or the result has fewer
   than six tokens, the string "Action failed." is returned instead.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      # universal_newlines=True yields a text string on Python 2 and 3
      out = check_output(cmd_starttime_tmp, shell=True, universal_newlines=True)
      return out.split()[5]
   except Exception:
      # "except Exception" instead of a bare except, so that
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."
176
177
# create a control file in the temporary directory of a job
def _TouchJobFile(username, jobid, filename):
   """Touch <cmd_tmpdir>/<username>.<part of jobid after the first dot>/<filename>.

   Returns None on success and the string "Action failed." if the touch
   command could not be run or exited with a non-zero status.
   """

   cmd_touch = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/" + filename

   try:
      # The with-statement closes the devnull handle again (it used to be
      # leaked). stderr of the touch command is discarded.
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_touch, shell=True, stderr=devnull)
   except Exception:
      return "Action failed."

   # Success is deliberately reported as None: this is what callers of
   # DoStopNow/DoRestartNow have received so far.
   return None


# request the stop of a job
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of the job.

   Presumably the running job checks for this file and terminates when it
   exists -- TODO confirm.

   username -- user name that forms the first part of the job directory name
   jobid    -- job id; the part after its first dot forms the second part of
               the job directory name

   Returns None on success and "Action failed." otherwise.
   """

   return _TouchJobFile(username, jobid, "DO_STOP_NOW")


# request the restart of a job
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of the job.

   Presumably the running job checks for this file and initiates a restart
   when it exists -- TODO confirm.

   username -- user name that forms the first part of the job directory name
   jobid    -- job id; the part after its first dot forms the second part of
               the job directory name

   Returns None on success and "Action failed." otherwise.
   """

   return _TouchJobFile(username, jobid, "DO_RESTART_NOW")
201
202
# START OF MAIN
#
# Dispatch on the requested action. "queue" prints its records itself; every
# other known action returns a value that is printed here (for "stop" and
# "restart" this is None when the action succeeded).
_printing_actions = {
   "check":   lambda: CheckJob(data),
   "cancel":  lambda: CancelJob(data),
   "start":   lambda: GetStartTime(data),
   "stop":    lambda: DoStopNow(data,data2),
   "restart": lambda: DoRestartNow(data,data2),
}

if ( action == "queue" ):
   ReadQueue(data)
elif ( action in _printing_actions ):
   # the handlers are wrapped in lambdas so that only the selected one runs
   print(_printing_actions[action]())
else:
   print("Error. Action " + action + " unknown.")
218
Note: See TracBrowser for help on using the repository browser.