source: palm/trunk/SCRIPTS/palm_wdd @ 1611

Last change on this file since 1611 was 1611, checked in by maronga, 9 years ago

added new palm watchdog, removed old nc2vdf scripts

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 6.0 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#--------------------------------------------------------------------------------#
4# This file is part of PALM.
5#
6# PALM is free software: you can redistribute it and/or modify it under the terms
7# of the GNU General Public License as published by the Free Software Foundation,
8# either version 3 of the License, or (at your option) any later version.
9#
10# PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License along with
15# PALM. If not, see <http://www.gnu.org/licenses/>.
16#
17# Copyright 1997-2015  Leibniz Universitaet Hannover
18#--------------------------------------------------------------------------------#
19#
20# Current revisions:
21# -----------------
22# Initial revision
23#
24# Former revisions:
25# -----------------
26# $Id: palm_wdd 1611 2015-07-07 12:23:22Z maronga $
27#
28#
29# Description:
30# ------------
31# PALM watchdog client for monitoring batch jobs on a variety of hosts specified
32# by the user. The watchdog server requires Python 2.7 or higher to be installed
33# on each host to be monitored.
34#
35# Instructions:
36# -------------
37# 1) Modify the header section of palm_wd
38# 2) Move .wd.olddata and .wd.newdata to your palm directory
39#    (e.g. /home/user/current_version/.wd.newdata etc.)
40# 3) Modify a copy of palm_wdd for each host to be monitored and move it to the
41#    respective hosts
42# 4) Start the client either from mrungui or from shell by "nohup palm_wd&"
43#
44# To do:
45# ------
46# 1) Add "Options", "Help" and "Manual"
47# 2) Move user settings to a configuration file
48#------------------------------------------------------------------------------!
49
50import os
51import sys
52from subprocess import check_output
53
54# START OF HEADER
55
# Host specific configuration. Adapt these values to the batch system of the
# host on which this script is installed.

# Shell command listing the queue. The second command line argument is
# appended to it as the egrep pattern.
cmd_readqueue = "showq | egrep "

# Base of the temporary directory. The second command line argument is
# appended to it.
cmd_tmpdir = "/gfs1/tmp/"

# Command that cancels a job; it is called with the job id as argument.
cmd_canceljob = "canceljob"

# Command that prints the details of a job; it is called with the job id.
cmd_checkjob = "checkjob"

# Pattern grepped from the output of cmd_checkjob to find the real job name.
cmd_realname_grep = "AName"

# Command that prints the estimated start time; it is called with the job id.
cmd_starttime = "showstart"

# Pattern grepped from the output of cmd_starttime to find the start time.
cmd_starttime_grep = "start in"
64
65# END OF HEADER
66
67
# Command line interface: palm_wdd <action> <data> [<data2>]
#   action: queue | check | cancel | start | stop | restart
#   data:   search pattern / user name (queue, stop, restart) or job id
#           (check, cancel, start)
#   data2:  job id, evaluated by the actions stop and restart only
if len(sys.argv) < 3:
   # abort with a readable message instead of an IndexError traceback
   sys.exit("Error. Usage: " + sys.argv[0] + " <action> <data> [<data2>]")

action = sys.argv[1]
data   = sys.argv[2]

# data2 is always defined, so that later references cannot raise a NameError
if len(sys.argv) > 3:
   data2 = sys.argv[3]
else:
   data2 = ""

# stop and restart cannot work without the job id
if action in ("stop", "restart") and not data2:
   sys.exit("Error. Action " + action + " requires a job id as third argument.")


# complete the host commands with the second command line argument
cmd_readqueue = cmd_readqueue + data
cmd_tmpdir    = cmd_tmpdir + data
76
77
78
79# reading queuing system
def ReadQueue(username):
   """Print one status line for every job found in the queue listing.

   The queue is read with the module level shell command cmd_readqueue (the
   search pattern is already part of that command). For every listed job a
   line with these space separated fields is printed:

      real job name, job id, queue column 4, job state (queue column 3),
      progress, queue column 5, estimated start time

   The real job name is "error" if it cannot be determined. The progress is
   "0" for jobs that are not in state "Running" or whose PROGRESS file cannot
   be evaluated.
   NOTE(review): queue columns 4 and 5 are presumably the number of cores and
   the remaining time of the showq listing -- confirm for the batch system
   in use.

   username -- prefix of the job's temporary directory below cmd_tmpdir,
               which is named <username>.<part of the job id after the
               first dot>
   """

   # Collect the queuing information. A failing command (e.g. egrep without
   # any match) is treated as an empty queue. universal_newlines makes
   # check_output return text under python 2 and python 3.
   try:
      out = check_output(cmd_readqueue, shell=True, universal_newlines=True)
      job_list = out.splitlines()
   except Exception:
      job_list = []

   for line in job_list:

      # split the queue line into its non-empty, blank separated columns
      job_data = [column for column in line.split(" ") if column]

      # skip lines that do not contain all columns evaluated below, instead
      # of aborting the complete listing with an IndexError
      if len(job_data) < 5:
         continue

      jobid = job_data[0]
      state = job_data[2]

      # retrieve the real job name
      cmd_realname = cmd_checkjob + " " + jobid + "|grep " + cmd_realname_grep
      try:
         out = check_output(cmd_realname, shell=True, universal_newlines=True)
         job_realname = out.split(" ")[1].rstrip()
      except Exception:
         job_realname = "error"

      # for running jobs, determine the progress from the second line of the
      # file PROGRESS in the job's temporary directory
      job_progress = "0"
      if state == "Running":
         cmd_progress = "cat " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/PROGRESS"
         try:
            # the with statement closes the devnull handle again
            with open(os.devnull, 'w') as devnull:
               out = check_output(cmd_progress, shell=True, stderr=devnull, universal_newlines=True)
            job_progress = out.splitlines()[1].split(" ")[1]
         except Exception:
            job_progress = "0"

      # output the job data
      job_starttime = GetStartTime(jobid)
      print(" ".join([job_realname, jobid, job_data[3], state, job_progress, job_data[4], job_starttime]))
131
132
133# check details of specific job
def CheckJob(jobid):
   """Return the details of a job as printed by the command cmd_checkjob.

   jobid -- id of the job, appended as argument to cmd_checkjob

   Returns the complete command output as text, or "No details available."
   if the command cannot be executed or exits with an error.
   """

   cmd_checkjob_tmp = cmd_checkjob + " " + jobid

   try:
      # universal_newlines makes check_output return text under python 2 and 3
      return check_output(cmd_checkjob_tmp, shell=True, universal_newlines=True)
   except Exception:
      # best effort: any failure of the command is reported as a message,
      # while KeyboardInterrupt and SystemExit are no longer swallowed
      return "No details available."
145
146
147# cancel a specific job
def CancelJob(jobid):
   """Cancel a job with the command cmd_canceljob.

   jobid -- id of the job, appended as argument to cmd_canceljob

   Returns the complete command output as text, or "Action failed." if the
   command cannot be executed or exits with an error.
   """

   cmd_canceljob_tmp = cmd_canceljob + " " + jobid

   try:
      # universal_newlines makes check_output return text under python 2 and 3
      return check_output(cmd_canceljob_tmp, shell=True, universal_newlines=True)
   except Exception:
      # best effort: any failure of the command is reported as a message,
      # while KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."
159
160
161# retrieve estimated start time of job
def GetStartTime(jobid):
   """Return the estimated start time of a job.

   The command cmd_starttime is called with the job id, and its output is
   filtered by grep for the pattern cmd_starttime_grep. The sixth whitespace
   separated field of the remaining output is returned.
   NOTE(review): this presumably matches a line of the form
   "Estimated Rsv based start in <time> ..." -- confirm for the batch system
   in use.

   jobid -- id of the job, appended as argument to cmd_starttime

   Returns the start time field as text, or "Action failed." if the command
   fails, grep finds no match, or the output has fewer than six fields.
   """

   cmd_starttime_tmp = cmd_starttime + " " + jobid + "|grep \"" + cmd_starttime_grep + "\""

   try:
      # universal_newlines makes check_output return text under python 2 and 3
      out = check_output(cmd_starttime_tmp, shell=True, universal_newlines=True)
      return out.split()[5]
   except Exception:
      # best effort: any failure is reported as a message, while
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."
173
174
def DoStopNow(username,jobid):
   """Create the file DO_STOP_NOW in the temporary directory of a job.

   The directory is cmd_tmpdir/<username>.<part of jobid after its first dot>.
   NOTE(review): the running job presumably checks for this file and stops
   when it exists -- that logic is not part of this script.

   username -- prefix of the name of the job's temporary directory
   jobid    -- id of the job

   Returns None on success and "Action failed." if the file cannot be created.
   """

   # create the flag file with touch, suppressing the error output
   cmd_dostop = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_STOP_NOW"
   try:
      # the with statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dostop, shell=True, stderr=devnull)
   except Exception:
      # best effort: any failure is reported as a message, while
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."

   return None
186
def DoRestartNow(username,jobid):
   """Create the file DO_RESTART_NOW in the temporary directory of a job.

   The directory is cmd_tmpdir/<username>.<part of jobid after its first dot>.
   NOTE(review): the running job presumably checks for this file and
   initiates a restart when it exists -- that logic is not part of this
   script.

   username -- prefix of the name of the job's temporary directory
   jobid    -- id of the job

   Returns None on success and "Action failed." if the file cannot be created.
   """

   # create the flag file with touch, suppressing the error output
   cmd_dorestart = "touch " + cmd_tmpdir + "/" + username + "." + jobid.partition(".")[2] + "/DO_RESTART_NOW"
   try:
      # the with statement closes the devnull handle again
      with open(os.devnull, 'w') as devnull:
         check_output(cmd_dorestart, shell=True, stderr=devnull)
   except Exception:
      # best effort: any failure is reported as a message, while
      # KeyboardInterrupt and SystemExit are no longer swallowed
      return "Action failed."

   return None
198
199
200# START OF MAIN
# Dispatch the action requested on the command line. ReadQueue prints its
# result lines itself; the return value of every other action is printed here.
if action == "queue":
   ReadQueue(data)
else:
   # The lambdas defer the evaluation of data and data2 until the selected
   # action is actually executed.
   _handlers = {
      "check":   lambda: CheckJob(data),
      "cancel":  lambda: CancelJob(data),
      "start":   lambda: GetStartTime(data),
      "stop":    lambda: DoStopNow(data, data2),
      "restart": lambda: DoRestartNow(data, data2),
   }
   _handler = _handlers.get(action)
   if _handler is None:
      print("Error. Action " + action + " unknown.")
   else:
      print(_handler())
215
Note: See TracBrowser for help on using the repository browser.