% $Id: restarts_with_mrun.tex 1515 2015-01-02 11:35:51Z gronemeier $
\input{header_tmp.tex}
%\input{../header_lectures.tex}

\usepackage[utf8]{inputenc}
\usepackage{ngerman}
\usepackage{pgf}
\usepackage{subfigure}
\usepackage{units}
\usepackage{multimedia}
\usepackage{hyperref}
\newcommand{\event}[1]{\newcommand{\eventname}{#1}}
\usepackage{xmpmulti}
\usepackage{tikz}
\usetikzlibrary{shapes,arrows,positioning}
\usetikzlibrary{decorations.markings}
\usetikzlibrary{decorations.pathreplacing}
\def\Tiny{\fontsize{4pt}{4pt}\selectfont}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multicol}
\usepackage{pdfcomment}
\usepackage{graphicx}
\usepackage{listings}
\lstset{showspaces=false,language=fortran,basicstyle= \ttfamily,showstringspaces=false,captionpos=b}

\institute{Institute of Meteorology and Climatology, Leibniz Universität Hannover}
\selectlanguage{english}
\date{last update: \today}
\event{PALM Seminar}
\setbeamertemplate{navigation symbols}{}

\setbeamertemplate{footline}
  {
    \begin{beamercolorbox}[rightskip=-0.1cm]&
      {\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}}
    \end{beamercolorbox}
    \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex, leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot}
      {\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber}
    \end{beamercolorbox}
    \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot}
    \end{beamercolorbox}
  }
%\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}}

\title[Carrying out restart runs with mrun]{Carrying out restart runs with \texttt{mrun}}
\author{PALM group}

\begin{document}

% Folie 1
\begin{frame}
   \titlepage
\end{frame}

\section{Carrying out restart runs with mrun}
\subsection{Carrying out restart runs with mrun}

% Folie 2
\begin{frame}
   \frametitle{Definition of “restart run“}
   \begin{itemize}
      \item<1-> A \textbf{“restart run“} is a model run, which starts with
an initial condition given by the simulated flow at the end of a previous (restart or initial) run.
      \item<2-> In order to carry out a restart run, a file has to be written at the end of the previous run, which contains the values of all flow variables at the last time step. This file has to be read at the beginning of the restart run.
      \item<3-> Initial and respective restart runs form a so-called \textbf{job chain}.
   \end{itemize}
\end{frame}

% Folie 3
\begin{frame}
   \frametitle{Reasons for Restart Runs}
   \begin{itemize}
      \item<1-> The maximum job time is generally limited by the queuing system:
      \begin{itemize}
         \item<1-> simulations must be split into several parts
      \end{itemize}
      \item<2-> The user wants to carry out several runs on the basis of the same initial temporal development:
      \begin{itemize}
         \item<1-> the initial phase needs to be simulated only once, all runs start from the end point of this initial phase by reading the flow field data written at the end of the initial run
      \end{itemize}
   \end{itemize}
\end{frame}

% Folie 4
\begin{frame}
   \frametitle{Carrying Out Restart Runs With \texttt{mrun}}
   \scriptsize
   \begin{columns}[T]
      \begin{column}{1.0\textwidth}
         Concerning \texttt{mrun}, the first thing required to enable restart runs is to use the additional activating string \grqq \texttt{restart}\grqq\, in the \texttt{mrun}-call for the \underline{initial run}:\\
         \vspace{1mm}
         \quad \texttt{mrun -d test ... -r \dq d3\# restart\dq}\\
         \ \\
         This will have the following effects:
         \vspace{1mm}
         \tiny
         \begin{itemize}
            \item<2-> At the end of the run, all necessary variables will be written as binary data to the local file \texttt{BINOUT}. This is caused by an entry in the configuration file\\
            \vspace{1mm}
            \quad \texttt{\%write\underline{ }binary true restart}\\
            \vspace{1mm}
            which sets the environment variable \texttt{write\underline{ }binary}, which is in turn read by PALM from the local file \texttt{ENVPAR} created by \texttt{mrun}.
\vspace{3mm}
            \item<3-> This binary file will be permanently stored if an appropriate file connection statement exists\\
            \vspace{1mm}
            \quad \texttt{BINOUT out:loc:flpe restart \~{}/palm/current\underline{ }version/JOBS/\$fname/RESTART \underline{ }d3d}
            \vspace{3mm}
            \item<4-> If, during the run, PALM detects that the simulation cannot be finished due to limited job time, it tells \texttt{mrun} (by creating a local file named \texttt{CONTINUE\underline{ }RUN}) that a restart job has to be started. \texttt{mrun} will then automatically start such a job by submitting the command\\
            \vspace{1mm}
            \quad \texttt{mrun -d test ... -r \dq d3f restart\dq}\\
            \vspace{1mm}
            on the \textbf{local host}. Options of this command are nearly the same as for the initial run, but every sharp character (\grqq\#\grqq) in the activating strings is replaced by an \grqq f\grqq.
         \end{itemize}
         \scriptsize
         \vspace{2mm}
         \onslide<5->\textcolor{red}{\textbf{This affects the activation of file connections for the restart job!}}
      \end{column}
   \end{columns}
\end{frame}

% Folie 5
\begin{frame}
   \frametitle{Input Files Necessary For Restart Jobs}
   \scriptsize
   \vspace{3mm}
   File connection statements for input files from the default \texttt{.mrun.config} file:\\
   \quad \texttt{PARIN \hspace{0.5em} in:job \hspace{3em} d3\# \hspace{0.5em} \$base\underline{ }data/\$fname/INPUT \hspace{1.5em} \underline{ }p3d}\\
   \quad \texttt{PARIN \hspace{0.5em} in:job \hspace{3em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/INPUT \hspace{1.5em} \underline{ }p3df}\\
   \quad \texttt{BININ \hspace{0.5em} in:loc:flpe \hspace{0.5em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/RESTART \hspace{0.5em} \underline{ }d3d}\\
   \vspace{4mm}
   \begin{itemize}
      \item<2-> For the restart job, the model receives a different parameter file than for the initial job (e.g.
\texttt{example\underline{ }cbl\underline{ }p3d\textcolor{blue}{f}} instead of \texttt{example\underline{ }cbl\underline{ }p3d}).\\
      \vspace{4mm}
      The parameter file for the restart job is nearly the same as for the initial run, but it must contain the parameter setting\\
      \vspace{1mm}
      \quad \texttt{initializing\underline{ }actions = 'read\underline{ }restart\underline{ }data'}\\
      \vspace{1mm}
      in the \texttt{\&inipar}-NAMELIST-group. All other \texttt{\&inipar}-parameter-settings are ignored!\\
      \vspace{4mm}
      \texttt{\&d3par}-parameter values can freely be changed compared with the parameter file for the initial run.\\
      \vspace{4mm}
      \item<3-> Input binary data file (\texttt{BININ}) is necessary (and available) only for\\
      restart jobs
   \end{itemize}
\end{frame}

% Folie 6
\begin{frame}
   \frametitle{Output File Handling in Restart Jobs}
   \scriptsize
   \vspace{2mm}
   Example for output file connection statements from the default \texttt{.mrun.config} file:\\
   \vspace{2mm}
   \quad \texttt{RUN\underline{ }CONTROL \hspace{0.5em} out:loc:tr \hspace{1em} d3\# \hspace{0.5em} \$base\underline{ }data/\$fname/MONITORING \hspace{0.5em} \underline{ }rc}\\
   \quad \texttt{RUN\underline{ }CONTROL \hspace{0.5em} out:loc:tra \hspace{0.5em} d3f \hspace{0.5em} \$base\underline{ }data/\$fname/MONITORING \hspace{0.5em} \underline{ }rc}\\
   \vspace{2mm}
   In case of restart jobs, the contents of many local output files are appended to the respective permanent files from the initial or previous run by using the \texttt{tra} file attribute.\\
   \vspace{6mm}
   \onslide<2-> File connection statement example for appending netCDF files when PALM is running on a remote host:\\
   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}in:loc\hspace{2.5em}prf\hspace{3em}\$base\underline{ }data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}out:loc\hspace{2em}pr\#:prf\hspace{1em}\$base\underline{
}data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
   \quad \texttt{DATA\underline{ }1D\underline{ }PR\underline{ }NETCDF\hspace{1em}out:loc:tr\hspace{0.5em}pr\#:prf\hspace{1em}\$base\underline{ }data/\$fname/OUTPUT\hspace{0.5em}\underline{ }pr\hspace{0.5em}nc}\\
   \vspace{2mm}
   The netCDF file from the respective previous run has to be provided as an INPUT file.\\
   \vspace{2mm}
   Therefore, if running PALM on a remote host, a copy of this data file must be additionally stored on the remote host (second statement). On the local host, each run creates a new file (cycle) which contains the complete data from the current run and all previous runs.
\end{frame}

% Folie 7
\begin{frame}
   \frametitle{Handling of Large Binary Data Files}
   \scriptsize
   \begin{columns}
      \column{1.1\textwidth}
      \vspace{-1mm}
      \begin{itemize}
         \item<1-> Typically, the binary restart files are very large, so that they cannot be stored in the user's home-directory because of limited disk quotas. Also, hard disks where \texttt{/home} is stored are typically very slow, so that the copy process takes a very long time.
         \vspace{1mm}
         \item<2-> Using the file attribute \texttt{fl} (abbreviation for German \grqq Fortsetzungslauf\grqq) in the output file connection statement causes \texttt{mrun} to copy the local file to a special directory, which can be defined in the configuration file by the environment variable \texttt{tmp\underline{ }data\underline{ }catalog}. The permanent file described in the connection statement is also created, but it is \textbf{empty}.
         \vspace{1mm}
         \item<3-> At the end of the job, the second-to-last cycle of the respective file with attribute \texttt{fl} is automatically deleted by \texttt{mrun} from the \texttt{tmp\underline{ }data\underline{ }catalog} in order to save disk space. This can be switched off with \texttt{mrun}-option \grqq\texttt{-k}\grqq\ (keep data from previous run).
\end{itemize}
   \end{columns}
   \vspace{2mm}
   \onslide<4-> \textbf{Example:}\\
   \tiny \quad \texttt{\%base\underline{ }data\hspace{4.5em}\~{}/palm/current\underline{ }version/JOBS}\\
   \tiny \quad \texttt{\%tmp\underline{ }data\underline{ }catalog\hspace{1.0em}/gfs1/work/niksiraa/palm\underline{ }restart\underline{ }data}\\
   \vspace{1mm}
   \tiny \quad \texttt{BINOUT\hspace{1.0em}out:loc:flpe\hspace{1.0em}restart\hspace{1.0em}\$base\underline{ }data/\$fname/RESTART\hspace{1.0em}\underline{ }d3d}\\
   \ \\
   \onslide<5-> \scriptsize \textbf{Files (directories) created when using \texttt{-d example\underline{ }cbl}:} \\
   \tiny \quad \texttt{/gfs1/work/niksiraa/palm\underline{ }restart\underline{ }data/example\underline{ }cbl\underline{ }d3d}\\
   \tiny \quad \texttt{\~{}/palm/current\underline{ }version/JOBS/example/RESTART/example\underline{ }cbl\underline{ }d3d \# empty file (directory)}\\
   \vspace{2mm}
   \onslide<6-> \scriptsize \textcolor{red}{Concerning input files, \texttt{mrun} always determines the current cycle number to be \underline{used from the contents of the directory given by the file connection}\\ \underline{statement!}}
\end{frame}

% Folie 8
% NOTE: in a [fragile] frame, \end{frame} must stand alone at the start of a line.
\begin{frame}[fragile]
   \frametitle{Checking the Restart Job Execution}
   \tikzstyle{yellow} = [rectangle, draw, fill=yellow!30, text width=1.2\textwidth, font=\Tiny,scale=0.8]
   \scriptsize
   \vspace{-40mm}
   \begin{itemize}
      \item essentially by looking at the messages in the job protocol file:
   \end{itemize}
   \centering
   \begin{tikzpicture}[remember picture, overlay]
      \node [yellow] (terminal) at (0mm,-30mm) {\begin{lstlisting}
*** execution starts in directory "/gfs1/work/nikleboe/nikleboe.21239"
----------------------------------------------------------------------------
.
.
   run will be terminated due to user settings of restart_time / dt_restart
   new restart time is: 3600. s
.
.
----------------------------------------------------------------------------
*** execution finished
.
.
----------------------------------------------------------------------------
*** all OUTPUT-files saved
*** initiating restart-run on "130.75.105.111" using command:
    mrun -c.mrun.config -dexample_cbl -hlccrayh -Hlcmuk -m1500 -t3600 -qmpp1testq -R130.75.105.111 -Uboeske ...
----------------------------------------------------------------------------
*** ssh will be used to initiate restart-runs!
.
.
*** MRUN 2.1 Rev: 1358 $ will be executed. Please wait ...
#------------------------------------------------------------------------#
| MRUN 2.1 Rev: 1358 $                     Thu Jun 31 14:09:30 CEST 2014 |
|                                                                        |
| called on:               vaudaire                                      |
.
.
| Files to be compiled:                                                  |
|   palm.f90 user_example.f90                                            |
#------------------------------------------------------------------------#
.
.
----------------------------------------------------------------------------
*** restart-run initiated
--> all actions finished
Bye, bye nikleboe !!
\end{lstlisting}
      };
      \node[rectangle, draw,text width=0.29\textwidth, fill=white] at (35mm,-5mm) {\noindent \scriptsize In this example, restart time has been set manually by the user.};
   \end{tikzpicture}
\end{frame}

% Folie 9
\begin{frame}
   \frametitle{Setting the Restart Time Manually}
   \scriptsize
   \begin{columns}
      \column{1.07\textwidth}
      \begin{itemize}
         \item<1-> By default, PALM checks after every timestep whether enough time remains from the job's cpu limit to carry out the next timestep:\\
         \vspace{1mm}
         (\quad \textcolor{red}{\grqq\texttt{total job time}\grqq\,} - \grqq\texttt{time already consumed}\grqq\,) \texttt{<=} \textcolor{blue}{\texttt{termination\underline{ }time\underline{ }needed}}\\
         (\textcolor{red}{as given by \texttt{mrun}-option \texttt{-t} ...}) \hspace{5mm} (\textcolor{blue}{as given by parameter in \texttt{\&d3par}-NAMELIST})\\
         \vspace{3mm}
         \item<2-> \texttt{termination\underline{ }time\underline{ }needed} has to include the cpu time needed before running PALM (e.g.
for compilation, copying of input data, etc.; default value: 300 s)!\\
         \ \\
         \onslide<3-> \textbf{Warning:}\\
         \vspace{1mm}
         \quad \quad \grqq\texttt{total job time}\grqq\, \texttt{<=} \texttt{termination\underline{ }time\underline{ }needed},\\
         \quad forces a restart after every timestep!
         \vspace{3mm}
         \item<4-> \texttt{\&d3par}-parameters \texttt{restart\underline{ }time} and \texttt{dt\underline{ }restart} can be used to set restart time(s) manually.\\
         \vspace{3mm}
         \item<5-> In case of manually setting the restart time, the default checking (see above) is still active and a restart will be automatically forced if the job reaches its cpu limit, even if the manually set restart time has not been reached!\\
      \end{itemize}
   \end{columns}
\end{frame}

% Folie 10
\begin{frame}
   \frametitle{Starting Restart Jobs Manually}
   \scriptsize
   \begin{itemize}
      \item<1-> After a job has finished (\texttt{end\underline{ }time} has been reached), the user can submit a restart job manually (provided that restart data have been saved) by entering:\\
      \vspace{2mm}
      \quad \texttt{mrun ... -r \dq d3f ...\dq\, ...}\\
      or\\
      \quad \texttt{mrun ... -r \dq d3f restart ...\dq\, ...}\\
      \ \\
      \item<2-> Remember to increase the value of \texttt{end\underline{ }time} in the parameter file before submitting the job.
      \vspace{2mm}
      \item<3-> If a manually started restart job shall continue a run of a former job chain which is somewhere in the middle of this chain, all binary files with respective higher cycle numbers have to be deleted or removed from their respective directories.
   \end{itemize}
\end{frame}

\end{document}