source: palm/trunk/TUTORIAL/SOURCE/restarts_with_mrun.tex @ 1626

Last change on this file since 1626 was 1515, checked in by boeske, 10 years ago

several updates in the tutorial

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 16.1 KB
Line 
1% $Id: restarts_with_mrun.tex 1515 2015-01-02 11:35:51Z heinze $
2\input{header_tmp.tex}
3%\input{../header_lectures.tex}
4
5\usepackage[utf8]{inputenc}
6\usepackage{ngerman}
7\usepackage{pgf}
8\usepackage{subfigure}
9\usepackage{units}
10\usepackage{multimedia}
11\usepackage{hyperref}
12\newcommand{\event}[1]{\newcommand{\eventname}{#1}}
13\usepackage{xmpmulti}
14\usepackage{tikz}
15\usetikzlibrary{shapes,arrows,positioning}
16\usetikzlibrary{decorations.markings}
17\usetikzlibrary{decorations.pathreplacing}
18\def\Tiny{\fontsize{4pt}{4pt}\selectfont}
19\usepackage{amsmath}
20\usepackage{amssymb}
21\usepackage{multicol}
22\usepackage{pdfcomment}
23\usepackage{graphicx}
24\usepackage{listings}
25\lstset{showspaces=false,language=fortran,basicstyle=
26        \ttfamily,showstringspaces=false,captionpos=b}
27
28\institute{Institute of Meteorology and Climatology, Leibniz Universität Hannover}
29\selectlanguage{english}
30\date{last update: \today}
31\event{PALM Seminar}
32\setbeamertemplate{navigation symbols}{}
33
34\setbeamertemplate{footline}
35  {
36    \begin{beamercolorbox}[rightskip=-0.1cm]&
37     {\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}}
38    \end{beamercolorbox}
39    \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,
40      leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot}
41      {\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber}
42    \end{beamercolorbox}
43    \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot}
44    \end{beamercolorbox}
45  }
46%\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}}
47
48\title[Carrying out restart runs with mrun]{Carrying out restart runs with \texttt{mrun}}
49\author{PALM group}
50
51\begin{document}
52
53% Folie 1
54\begin{frame}
55   \titlepage
56\end{frame}
57
58\section{Carrying out restart runs with mrun}
59\subsection{Carrying out restart runs with mrun}
60
61         
62
63% Folie 2
64\begin{frame}
65   \frametitle{Definition of ``restart run''}
66 
67   \begin{itemize}
68      \item<1-> A \textbf{``restart run''} is a model run that starts with an initial condition given by the simulated flow at the end of a previous (restart or initial) run.
69      \item<2-> In order to carry out a restart run, a file has to be written at the end of the previous run, which contains the values of all flow variables at the last time step. This file has to be read at the beginning of the restart run.
70      \item<3-> Initial and respective restart runs form a so-called \textbf{job chain}.
71   \end{itemize} 
72
73\end{frame}
74
75
76% Folie 3
77\begin{frame}
78   \frametitle{Reasons for Restart Runs}
79 
80   \begin{itemize}
81      \item<1-> The maximum job time is generally limited by the queuing system:
82      \begin{itemize}
83         \item<1-> simulations must be split into several parts
84      \end{itemize} 
85      \item<2-> The user wants to carry out several runs on the basis of the same initial temporal development:
86      \begin{itemize}
87         \item<1-> the initial phase needs to be simulated only once;
88all runs start from the end point of this initial phase by reading the flow field data written at the end of the initial run
89      \end{itemize}
90   \end{itemize} 
91
92\end{frame}
93
94
95% Folie 4
96\begin{frame}
97   \frametitle{Carrying Out Restart Runs With \texttt{mrun}}
98   \scriptsize
99   \begin{columns}[T]
100      \begin{column}{1.0\textwidth}
101         Concerning \texttt{mrun}, the first thing required to enable restart runs is to use the additional activating string \grqq \texttt{restart}\grqq\, in the \texttt{mrun}-call for the \underline{initial run}:\\
102         \vspace{1mm}
103         \quad \texttt{mrun -d test ... -r \dq d3\# restart\dq}\\
104         \ \\
105         This will have the following effects:
106         \vspace{1mm}
107         \tiny
108         \begin{itemize}
109            \item<2-> At the end of the run, all necessary variables will be written as binary data to the local file \texttt{BINOUT}. This is caused by an entry in the configuration file\\
110            \vspace{1mm}
111            \quad \texttt{\%write\underline{ }binary true restart}\\
112            \vspace{1mm}
113            which sets the environment variable \texttt{write\underline{ }binary}, which is in turn read by PALM from the local file \texttt{ENVPAR} created by \texttt{mrun}.
114            \vspace{3mm}
115            \item<3-> This binary file will be permanently stored in case that an appropriate file connection statement exists\\
116            \vspace{1mm}
117            \quad \texttt{BINOUT  out:loc:flpe restart \~{}/palm/current\underline{ }version/JOBS/\$fname/RESTART  \underline{ }d3d}
118            \vspace{3mm}
119            \item<4-> If, during the run, PALM detects that the simulation cannot be finished due to limited job time, it tells \texttt{mrun} (by creating a local file named \texttt{CONTINUE\underline{ }RUN}) that a restart job has to be started. \texttt{mrun} will then automatically start such a job by submitting the command\\
120            \vspace{1mm}
121            \quad \texttt{mrun -d test ... -r \dq d3f restart\dq}\\
122            \vspace{1mm}
123            on the \textbf{local host}. Options of this command are nearly the same as for the initial run, but every sharp character (\grqq\#\grqq) in the activating strings is replaced by an \grqq f\grqq.
124         \end{itemize}
125         \scriptsize
126         \vspace{2mm}
127         \onslide<5->\textcolor{red}{\textbf{This affects the activation of file connections for the restart job!}}
128      \end{column}
129   \end{columns}
130
131\end{frame}
132
133
134% Folie 5
135\begin{frame}
136   \frametitle{Input Files Necessary For Restart Jobs}
137   \scriptsize
138   \vspace{3mm}
139   File connection statements for input files from the default \texttt{.mrun.config} file:\\
140   \quad \texttt{PARIN \hspace{0.5em} in:job \hspace{3em} d3\# \hspace{0.5em} \$base\underline{ }data/\$fname/INPUT \hspace{1.5em} \underline{ }p3d}\\
141   \quad \texttt{PARIN \hspace{0.5em} in:job \hspace{3em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/INPUT \hspace{1.5em} \underline{ }p3df}\\
142   \quad \texttt{BININ \hspace{0.5em} in:loc:flpe \hspace{0.5em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/RESTART \hspace{0.5em} \underline{ }d3d}\\
143   \vspace{4mm}
144   \begin{itemize}
145      \item<2-> For the restart job, the model receives a different parameter file than for the initial job (e.g. \texttt{example\underline{ }cbl\underline{ }p3d\textcolor{blue}{f}} instead of \texttt{example\underline{ }cbl\underline{ }p3d}).\\
146   \vspace{4mm}
147   The parameter file for the restart job is nearly the same as for the initial run, but it must contain the parameter setting\\
148   \vspace{1mm}
149   \quad \texttt{initializing\underline{ }actions = 'read\underline{ }restart\underline{ }data'}\\
150   \vspace{1mm}
151   in the \texttt{\&inipar}-NAMELIST-group. All other \texttt{\&inipar}-parameter-settings are ignored!\\
152   \vspace{4mm}
153   \texttt{\&d3par}-parameter values can freely be changed compared with the parameter file for the initial run.\\
154   \vspace{4mm}
155   \item<3-> Input binary data file (\texttt{BININ}) is necessary (and available) only for\\ restart jobs
156   \end{itemize}
157\end{frame}
158
159
160% Folie 6
161\begin{frame}
162   \frametitle{Output File Handling in Restart Jobs }
163   \scriptsize
164   \vspace{2mm} 
165   Example for output file connection statements from the default \texttt{.mrun.config} file:\\
166   \vspace{2mm}
167   \quad \texttt{RUN\underline{ }CONTROL \hspace{0.5em} out:loc:tr \hspace{1em} d3\# \hspace{0.5em} \$base\underline{ }data/\$fname/MONITORING \hspace{0.5em} \underline{ }rc}\\
168   \quad \texttt{RUN\underline{ }CONTROL \hspace{0.5em} out:loc:tra \hspace{0.5em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/MONITORING \hspace{0.5em} \underline{ }rc}\\
169   \vspace{2mm}
170   In case of restart jobs, the contents of many local output files are appended to the respective permanent files from the initial or previous run by using the \texttt{tra} file attribute.\\
171   \vspace{6mm}
172   \onslide<2-> File connection statement example for appending netCDF files when PALM is running on a remote host:\\
173   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}in:loc\hspace{2.5em}prf\hspace{3em}\$base\underline{ }data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
174   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}out:loc\hspace{2em}pr\#:prf\hspace{1em}\$base\underline{ }data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
175   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}out:loc:tr\hspace{0.5em}pr\#:prf\hspace{1em}\$base\underline{ }data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
176   \vspace{2mm}
177   The netCDF file from the respective previous run has to be provided as an INPUT file.\\
178   \vspace{2mm}
179   Therefore, if running PALM on a remote host, a copy of this data file must be additionally stored on the remote host (second statement). On the local host, each run creates a new file (cycle) which contains the complete data from the current run and all previous runs.
180   
181\end{frame}
182
183
184% Folie 7
185\begin{frame}
186   \frametitle{Handling of Large Binary Data Files}
187   \scriptsize
188   \begin{columns}
189      \column{1.1\textwidth}
190      \vspace{-1mm}
191      \begin{itemize}
192         \item<1-> Typically, the binary restart files are very large, so that they cannot be stored in the user's home-directory because of limited disk quotas. Also, hard disks where \texttt{/home} is stored are typically very slow, so that the copy process takes a very long time.
193         \vspace{1mm}
194         \item<2-> Using the file attribute \texttt{fl} (abbreviation for German \grqq Fortsetzungslauf\grqq) in the output file connection statement causes \texttt{mrun} to copy the local file to a special directory, which can be defined in the configuration file by the environment variable \texttt{tmp\underline{ }data\underline{ }catalog}. The permanent file described in the connection statement is also created, but it is \textbf{empty}.
195         \vspace{1mm}
196         \item<3-> At the end of the job, the second-to-last cycle of the respective file with attribute \texttt{fl} is automatically deleted by \texttt{mrun} from the \texttt{tmp\underline{ }data\underline{ }catalog} in order to save disk space. This can be switched off with \texttt{mrun}-option \grqq\texttt{-k}\grqq\ (keep data from previous run).
197      \end{itemize}
198   \end{columns}
199   \vspace{2mm}
200      \onslide<4-> \textbf{Example:}\\
201      \tiny \quad \texttt{\%base\underline{ }data\hspace{4.5em}\~{}/palm/current\underline{ }version/JOBS}\\
202      \tiny \quad \texttt{\%tmp\underline{ }data\underline{ }catalog\hspace{1.0em}/gfs1/work/niksiraa/palm\underline{ }restart\underline{ }data}\\
203      \vspace{1mm}
204      \tiny \quad \texttt{BINOUT\hspace{1.0em}out:loc:flpe\hspace{1.0em}restart\hspace{1.0em}\$base\underline{ }data/\$fname/RESTART\hspace{1.0em}\underline{ }d3d}\\
205      \ \\
206      \onslide<5-> \scriptsize \textbf{Files (directories) created when using \texttt{-d example\underline{ }cbl}:}  \\
207      \tiny \quad \texttt{/gfs1/work/niksiraa/palm\underline{ }restart\underline{ }data/example\underline{ }cbl\underline{ }d3d}\\
208      \tiny \quad \texttt{\~{}/palm/current\underline{ }version/JOBS/example\underline{ }cbl/RESTART/example\underline{ }cbl\underline{ }d3d \# empty file (directory)}\\
209   \vspace{2mm}
210   \onslide<6-> \scriptsize \textcolor{red}{Concerning input files, \texttt{mrun} always determines the current cycle number to be \underline{used from the contents of the directory given by the file connection}\\ \underline{statement!}}
211
212\end{frame}
213
214
215% Folie 8
216\begin{frame}[fragile]
217   \frametitle{Checking the Restart Job Execution}
218   \tikzstyle{yellow} = [rectangle, draw, fill=yellow!30, text width=1.2\textwidth, font=\Tiny,scale=0.8]
219   \scriptsize
220   \vspace{-40mm}
221   \begin{itemize}
222      \item essentially by looking at the messages in the job protocol file:
223   \end{itemize} 
224   \centering
225   \begin{tikzpicture}[remember picture, overlay]
226   \node [yellow] (terminal)  at (0mm,-30mm) {\begin{lstlisting} 
227  *** execution starts in directory
228      "/gfs1/work/nikleboe/nikleboe.21239"
229  ----------------------------------------------------------------------------
230.
231.
232    run will be terminated due to user settings of
233    restart_time / dt_restart
234    new restart time is:  3600.  s
235.
236.
237  ----------------------------------------------------------------------------
238  *** execution finished
239.
240.
241 ----------------------------------------------------------------------------
242  *** all OUTPUT-files saved
243
244
245  *** initiating restart-run on "130.75.105.111" using command:
246      mrun -c.mrun.config -dexample_cbl -hlccrayh -Hlcmuk -m1500 -t3600 -qmpp1testq -R130.75.105.111 -Uboeske ...
247
248  ----------------------------------------------------------------------------
249*** ssh will be used to initiate restart-runs!
250.
251.
252*** MRUN  2.1 Rev: 1358 $ 
253    will be executed.     Please wait ...
254#------------------------------------------------------------------------#
255| MRUN  2.1 Rev: 1358 $                    Thu Jun 31 14:09:30 CEST 2014 |
256|                                                                        |
257| called on:               vaudaire                                      |
258.
259.
260| Files to be compiled:                                                  |
261| palm.f90 user_example.f90                                              |
262#------------------------------------------------------------------------#
263.
264.
265  ----------------------------------------------------------------------------
266  *** restart-run initiated
267
268
269 --> all actions finished
270
271     Bye, bye nikleboe !!
272\end{lstlisting}
273         };
274      \node[rectangle, draw,text width=0.29\textwidth, fill=white] at (35mm,-5mm) {\noindent \scriptsize In this example, restart time has been set
275     
276      manually by the user.};
277   \end{tikzpicture}
278
279\end{frame}
280
281
282% Folie 9
283\begin{frame}
284   \frametitle{Setting the Restart Time Manually}
285   \scriptsize
286   \begin{columns}
287      \column{1.07\textwidth}
288      \begin{itemize}
289         \item<1-> By default, PALM checks after every timestep whether enough time remains from the job's cpu limit to carry out the next timestep:\\
290         \vspace{1mm}
291         (\quad \textcolor{red}{\grqq\texttt{total job time}\grqq\,} - \grqq\texttt{time already consumed}\grqq\,) \texttt{<=} \textcolor{blue}{\texttt{termination\underline{ }time\underline{ }needed}}\\
292         (\textcolor{red}{as given by \texttt{mrun}-option \texttt{-t} ...}) \hspace{5mm}  (\textcolor{blue}{as given by parameter in \texttt{\&d3par}-NAMELIST})\\
293         \vspace{3mm}
294         \item<2-> \texttt{termination\underline{ }time\underline{ }needed} has to include the cpu time needed before running PALM (e.g. for compilation, copying of input data, etc.; default value: 300 s)!\\
295         \ \\
296         \onslide<3-> \textbf{Warning:}\\
297         \vspace{1mm}
298         \quad \quad \grqq\texttt{total job time}\grqq\, \texttt{<=} \texttt{termination\underline{ }time\underline{ }needed},\\
299         \quad forces a restart after every timestep!
300         \vspace{3mm}
301         \item<4-> \texttt{\&d3par}-parameters \texttt{restart\underline{ }time} and \texttt{dt\underline{ }restart} can be used to set restart time(s) manually.\\
302         \vspace{3mm}
303         \item<5-> In case of manually setting the restart time, the default checking (see above) is still active and a restart will be automatically forced if the job reaches its cpu limit, even if the manually set restart time has not been reached!\\
304      \end{itemize} 
305   \end{columns}
306\end{frame}
307
308
309% Folie 10
310\begin{frame}
311   \frametitle{Starting Restart Jobs Manually}
312   \scriptsize
313      \begin{itemize}
314         \item<1-> After a job has finished (\texttt{end\underline{ }time} has been reached), the user can submit a restart job manually (provided that restart data have been saved) by entering:\\
315         \vspace{2mm}
316         \quad \texttt{mrun ... -r \dq d3f ...\dq\, ...}\\
317         or\\
318              \quad \texttt{mrun ... -r \dq d3f restart ...\dq\, ...}\\
319              \ \\
320         \item<2-> Remember to increase the value of \texttt{end\underline{ }time} in the parameter file before submitting the job.
321         \vspace{2mm}
322         \item<3-> If a manually started restart job shall continue a run of a former job chain which is somewhere in the middle of this chain, all binary files with respective higher cycle numbers have to be deleted or removed from their respective directories.
323   \end{itemize} 
324\end{frame}
325
326\end{document}
Note: See TracBrowser for help on using the repository browser.