source: palm/trunk/TUTORIAL/SOURCE/parallelization.tex @ 1706

Last change on this file since 1706 was 1531, checked in by keck, 10 years ago

several updates in the tutorial

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 26.6 KB
Line 
1% $Id: parallelization.tex 1531 2015-01-26 13:58:29Z maronga $
2\input{header_tmp.tex}
3%\input{../header_lectures.tex}
4
5\usepackage[utf8]{inputenc}
6\usepackage{ngerman}
7\usepackage{pgf}
8\usepackage{subfigure}
9\usepackage{units}
10\usepackage{multimedia}
11\usepackage{hyperref}
12\newcommand{\event}[1]{\newcommand{\eventname}{#1}}
13\usepackage{xmpmulti}
14\usepackage{tikz}
15\usetikzlibrary{shapes,arrows,positioning}
16%\usetikzlibrary{decorations.markings}             %neues paket
17%\usetikzlibrary{decorations.pathreplacing}        %neues paket
18\def\Tiny{\fontsize{4pt}{4pt}\selectfont}
19\usepackage{amsmath}
20\usepackage{amssymb}
21\usepackage{multicol}
22\usepackage{pdfcomment}
23\usepackage{graphicx}
24\usepackage{listings}
25\lstset{showspaces=false,language=fortran,basicstyle=
26        \ttfamily,showstringspaces=false,captionpos=b}
27
28\institute{Institute of Meteorology and Climatology, Leibniz Universität Hannover}
29\selectlanguage{english}
30\date{last update: \today}
31\event{PALM Seminar}
32\setbeamertemplate{navigation symbols}{}
33
34\setbeamertemplate{footline}
35  {
36    \begin{beamercolorbox}[rightskip=-0.1cm]&
37     {\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}}
38    \end{beamercolorbox}
39    \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,
40      leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot}
41      {\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber}
42    \end{beamercolorbox}
43    \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot}
44    \end{beamercolorbox}
45  }
46%\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}}
47
48\title[Parallelization]{Parallelization}
49\author{PALM group}
50
51\begin{document}
52
53% Folie 1
54\begin{frame}
55   \titlepage
56\end{frame}
57
58\section{Parallelization}
59\subsection{Parallelization}
60
61% Folie 2
62\begin{frame}
63   \frametitle{Basics of Parallelization}
64   \tikzstyle{yellow} = [rectangle,  fill=yellow!20, text width=0.4\textwidth, font=\tiny]
65   \scriptsize
66   \textbf{Parallelization:}
67   \begin{itemize}
68      \item<2-> All processor elements (PE, core) are carrying out the same program code (SIMD).
69      \item<3-> Each PE of a parallel computer operates on a different set of data.
70   \end{itemize}
71
72   \ \\
73   \onslide<4->\textbf{Realization:}
74   \begin{columns}[T]
75      \begin{column}{0.45\textwidth}
76         \onslide<5->each PE solves the equations for a different subdomain of the total domain
77         \begin{center}
78            \includegraphics[width=0.3\textwidth]{parallelization_figures/subdomain_folie2.png}
79         \end{center}
80         \onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\
81         \onslide<9->\textbf{message passing model (MPI)}
82      \end{column}
83      \begin{column}{0.45\textwidth}
84         \onslide<6->program loops are parallelized, i.e. each processor solves for a subset of the total index range
85         \begin{center}
86            \begin{tikzpicture}[auto, node distance=0]
87               \node [yellow] (1) {%
88               \texttt{!\$OMP DO}\\
89               \texttt{DO  i = 1, 100}\\
90               \quad $\vdots$\\
91               \texttt{ENDDO}};
92            \end{tikzpicture}
93            \begin{tikzpicture}[auto, node distance=0]
94               \node [yellow] (2) {%
95               \texttt{!\$acc kernels}\\
96               \texttt{DO  i = 1, 100}\\
97               \quad $\vdots$\\
98               \texttt{ENDDO}};
99            \end{tikzpicture}
100         \end{center}
101         \vspace{-1mm}
102         \onslide<8-> parallelization can easily be done by the compiler, if all PEs have access to all variables (shared memory)\\
103         \onslide<10-> \textbf{shared memory model (OpenMP)}
104         \onslide<10-> \textbf{accelerator model (OpenACC)}
105      \end{column}
106   \end{columns}
107\end{frame}
108
109% Folie 3
110\begin{frame}
111   \frametitle{Basic Architectures of Massively Parallel Computers}
112   \tikzstyle{info} = [rectangle, text width=0.25\textwidth, font=\scriptsize]
113   
114   
115   \begin{center}
116      \begin{tikzpicture}
117
118         \node (center) at (0,1) {};
119         \onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network};
120         \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-XC30)};
121         \onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory};
122         \node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)};
123         \onslide<7-> \node (MPI) at (-3.5,-3) [ellipse,fill=yellow!90] {MPI};
124         \onslide<8-> \node (OpenMP) at (3.5,-3) [ellipse,fill=yellow!90, text width=0.13\textwidth] {\footnotesize OpenMP OpenACC};         
125         \onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems};
126         \node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster,
127            NEC-SX, SGI-ICE, Cray-XC)};
128
129% Addressable memory node (big)
130
131         \onslide<3-> \node (p1) at (2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
132         \node (p2) at (2.6,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
133         \node (p3) at (3.2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
134         \node (p4) at (3.8,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
135         \node (p5) at (4.4,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
136         \node (p6) at (5,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
137         
138         \draw[-] (3.5,0.7) -- (3.5,0.4);
139         \draw[-] (2,0.4) -- (5,0.4);
140         \draw[-] (2,0.4) -- (p1);
141         \draw[-] (2.6,0.4) -- (p2);
142         \draw[-] (3.2,0.4) -- (p3);         
143         \draw[-] (3.8,0.4) -- (p4);
144         \draw[-] (4.4,0.4) -- (p5);         
145         \draw[-] (5,0.4) -- (p6);
146         
147% Addressable memory node (small)
148         \onslide<4->
149           
150         \node (small_node) at (-2,0.6) [scale=0.2] {%
151            \begin{tikzpicture}
152
153               \node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {};
154
155               \node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
156               \node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {};
157               \node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
158               \node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {};
159               \node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {};
160               \node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {};
161           
162               \draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4);
163               \draw[-, ultra thick] (2,0.4) -- (5,0.4);
164               \draw[-, ultra thick] (2,0.4) -- (p1_small);
165               \draw[-, ultra thick] (2.6,0.4) -- (p2_small);
166               \draw[-, ultra thick] (3.2,0.4) -- (p3_small);         
167               \draw[-, ultra thick] (3.8,0.4) -- (p4_small);
168               \draw[-, ultra thick] (4.4,0.4) -- (p5_small);         
169               \draw[-, ultra thick] (5,0.4) -- (p6_small);
170               
171               
172            \end{tikzpicture}
173            } ;
174           
175         \draw[->, thick] (1.5,0.2) -- (small_node) ; 
176         \draw[-] (-2.7,0.75) -- (-2.3,0.725);
177         \onslide<5->
178         \node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node};
179
180% Black Arrows
181         \onslide<6-> \draw[->, thick] (-2.5,-1.5) -- (-0.8,-2.2) ;
182         \draw[->, thick] (2.5,-1.5) -- (0.8,-2.2) ;
183
184% MPI Arrows
185         \onslide<7-> \draw[->, ultra thick, color=yellow] (-3.5,-2.7) -- (-3.5,-1.5) ;
186         \draw[->, ultra thick, color=yellow] (-2.9,-3) -- (-1.0,-3.0) ;
187         \draw[->, ultra thick, color=yellow] (-3.0,-2.8) -- (1.5,-1.0) ;
188
189% OpenMP Arrows         
190         \onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ;
191         \draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ;
192         
193% Network decorations
194         \onslide<2-> \node (pr1) at (-4.6,0.7) [draw,circle,scale=0.5] {};
195         \node (mem1) at (-4.6,0.45) [draw,rectangle,scale=0.5] {};
196         \draw[-] (-4.5,0.9) -- (pr1);
197         \draw[-] (mem1) -- (pr1);
198         \draw[-] ([xshift=0.02cm]pr1.east) -- ([xshift=0.3cm, yshift=-0.2cm]pr1.east);
199         \draw[-] ([xshift=0.02cm]mem1.east) -- ([xshift=0.3cm, yshift=-0.2cm]mem1.east); 
200         \node at (-3.7,0.45) {\tiny processor};   
201         \node at (-3.3,0.25) {\tiny addressable memory};
202         
203         \node (pr2) at (-4.6,1.3) [draw,circle,scale=0.5] {};
204         \node (mem2) at (-4.6,1.55) [draw,rectangle,scale=0.5] {};
205         \draw[-] (-4.5,1.1) -- (pr2);
206         \draw[-] (mem2) -- (pr2);
207         \node (pr3) at (-3.9,1.5) [draw,circle,scale=0.5] {};
208         \node (mem3) at (-3.9,1.75) [draw,rectangle,scale=0.5] {};
209         \draw[-] (-3.8,1.3) -- (pr3);
210         \draw[-] (mem3) -- (pr3);       
211         \node (pr4) at (-4.9,0.95) [draw,circle,scale=0.5] {};
212         \node (mem4) at (-4.9,0.7) [draw,rectangle,scale=0.5] {};
213         \draw[-] (-4.55,1.0) -- (pr4);
214         \draw[-] (mem4) -- (pr4);   
215         \node (pr5) at (-3.0,1.5) [draw,circle,scale=0.5] {};
216         \node (mem5) at (-3.0,1.75) [draw,rectangle,scale=0.5] {};
217         \draw[-] (-3.08,1.3) -- (pr5);
218         \draw[-] (mem5) -- (pr5);   
219         \node (pr6) at (-2.2,1.1) [draw,circle,scale=0.5] {};
220         \node (mem6) at (-2.2,1.35) [draw,rectangle,scale=0.5] {};
221         \draw[-] (-2.45,1.0) -- (pr6);
222         \draw[-] (mem6) -- (pr6);   
223               
224         \onslide<1->            
225      \end{tikzpicture}
226   \end{center}
227
228\end{frame}
229
230% Folie 4
231\begin{frame}
232   \frametitle{PALM Parallelization Model}
233   \scriptsize
234   \onslide<2-> \textbf{General demands for a parallelized program:}
235   \begin{itemize}
236      \item<3-> Load balancing
237      \item<4-> Small communication overhead
238      \item<5-> Scalability (up to large numbers of processors)
239   \end{itemize}
240   \vspace{2mm}
241   \onslide<6-> \textbf{The basic parallelization method used for PALM is a 2D-domain decomposition along $x$ and $y$:}\\
242   \begin{columns}[T]
243      \begin{column}{0.3\textwidth}
244         \onslide<7-> \includegraphics[width=1.0\textwidth]{parallelization_figures/subdomain.png}
245      \end{column}
246      \begin{column}{0.6\textwidth}
247         \ \\
248         \onslide<8-> contiguous data in memory (FORTRAN):\\
249         \ \\
250         \ \\
251         \vspace*{-0.05cm}
252         \textcolor{blue}{columns of i}\\
253         \vspace*{-0.04cm}
254         \textcolor{red}{no contiguous data at all}\\
255         \vspace*{0.17cm}
256         \onslide<9-> \textcolor{blue}{columns of k}\\
257         \vspace*{-0.04cm}
258         \textcolor{red}{planes of k,j (all data contiguous)}
259      \end{column}
260   \end{columns}   
261   \vspace{2mm}
262   \begin{itemize}
263      \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used.
264      \vspace{2mm}
265      \item<11-> Message passing is realized using MPI.
266      \vspace{2mm}
267      \item<12-> OpenMP parallelization as well as mixed usage of OpenMP and
268                    MPI is realized.
269   \end{itemize}
270\end{frame}
271
272% Folie 5
273\begin{frame}
274   \frametitle{Implications of Decomposition}
275   \scriptsize
276   \begin{columns}[T]
277      \begin{column}{0.5\textwidth}
278         \begin{itemize}
279            \item<2-> Central finite differences cause \textcolor{red}{local data dependencies}\\
280            \ \\
281            solution:  introduction of \textcolor{red}{ghost points}
282            \vspace{5mm}
283           
284            \begin{flushright}
285               \onslide<3-> $\left. \dfrac{\partial \psi}{\partial x} \right|_i = \dfrac{\psi_{i+1} - \psi_{i-1}}{2 \Delta x}$
286            \end{flushright}
287            \vspace{10mm}
288            \item<4-> FFT and linear equation solver cause \textcolor{red}{non-local data dependencies}\\
289            \ \\
290            solution: transposition of 3D-arrays
291         \end{itemize}
292      \end{column}
293      \begin{column}{0.6\textwidth}
294      \begin{center}
295         \vspace{-5mm}
296         \onslide<3-> \includegraphics[width=0.7\textwidth]{parallelization_figures/ghost_points.png}
297         \vspace{4mm}
298         \onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center}
299         \vspace{-4mm}
300         \textbf{Example: transpositions for solving the Poisson\\ \hspace{4.1em}equation}
301      \end{column}
302   \end{columns}   
303\end{frame} 
304
305% Folie 6
306\begin{frame}
307   \frametitle{How to Use the Parallelized Version of PALM}   
308   \scriptsize
309   \begin{columns}[T]
310      \begin{column}{1.12\textwidth}
311         \begin{itemize} 
312            \item<1-> The parallel version of PALM is switched on by \texttt{mrun}-option ``\texttt{-K parallel}''. Additionally, the number of required processors and the number of tasks per node (number of PEs to be used on one node) have to be provided:\\
313                 \quad \texttt{mrun ... -K parallel -X64 -T8 ...}
314                 \item<2-> From an accounting point of view, it is always most efficient to use all PEs of a node (e.g. \texttt{-T8}) (in case of a ``non-shared'' usage of nodes).
315                 \item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1-2 PEs less than the total number of PEs on the node are used.
316                 \item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g.  \texttt{-h necriam}). The virtual processor grid to be used can be set manually by d3par-parameters \texttt{npex} and \texttt{npey}.
317            \item<5-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?) or with Intel-Xeon-Phi accelerator boards.\\       
318         \end{itemize}
319         \begin{center}
320         \vspace{-3mm}
321         \onslide<4-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png}
322         \end{center}
323      \end{column}
324   \end{columns} 
325\end{frame}
326
327% Folie 7
328\begin{frame}
329   \frametitle{MPI Communication}   
330   \scriptsize
331   \begin{columns}[T]
332      \begin{column}{1.12\textwidth}
333         \begin{itemize}
334            \item<1-> MPI (message passing interface) is a portable interface for communication between PEs (FORTRAN or C library).
335            \vspace{2mm}
336            \item<2-> MPI on the Cray-XC30 of HLRN-III is provided with module \texttt{PrgEnv-cray} which is loaded by default.
337            \vspace{2mm}
338                 \item<3-> All MPI calls must be within\\
339                 \quad \texttt{CALL MPI\_INIT( ierror )}\\
340                 \quad $\vdots$\\
341            \quad \texttt{CALL MPI\_FINALIZE( ierror )}\\
342         \end{itemize}
343         
344      \end{column}
345   \end{columns} 
346\end{frame}
347
348% Folie 8
349\begin{frame}
350   \frametitle{Communication Within PALM}   
351   \small
352   \begin{itemize}
353      \item<1-> MPI calls within PALM are available when using the \texttt{mrun}-option ``\texttt{-K parallel}''.
354      \item<2-> Communication is needed for
355      \begin{itemize}
356         \footnotesize
357         \item<2-> exchange of ghost points
358         \item<3-> transpositions (FFT-Poisson-solver)
359         \item<4-> calculating global sums (e.g. for calculating horizontal averages)
360      \end{itemize}
361      \item<5-> Additional MPI calls are required to define the so-called virtual processor grid and to define special data types needed for more comfortable exchange of data.
362   \end{itemize}
363\end{frame}
364
365% Folie 9
366\begin{frame}
367   \frametitle{Virtual Processor Grid Used in PALM}   
368   \scriptsize
369   \vspace{2mm}
370   The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\
371   \ \\
372   \begin{itemize}
373      \item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so-called communicator (here: \texttt{comm2d}):\\
374      \tiny
375      \vspace{1.5mm}
376      \quad \texttt{ndim = 2}\\
377           \quad \texttt{pdims(1) = npex  \quad  ! \# of processors along x}\\
378           \quad \texttt{pdims(2) = npey  \quad  ! \# of processors along y}\\
379           \quad \texttt{cyclic(1) = .TRUE.}\\
380           \quad \texttt{cyclic(2) = .TRUE.}\\
381      \ \\
382           \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\
383           \quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )} 
384           \scriptsize
385      \vspace{4mm} 
386      \item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\
387      \tiny
388      \vspace{1.5mm}
389      \quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )}   
390      \scriptsize   
391      \vspace{4mm}
392      \item<4-> The ids of the neighbouring PEs are determined by:\\
393      \tiny
394      \vspace{1.5mm}
395      \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\
396      \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\
397   \end{itemize}
398\end{frame}
399
400% Folie 10
401\begin{frame}
402   \frametitle{Exchange of ghost points}   
403   \scriptsize
404         \begin{itemize}
405            \item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\
406            \tiny
407            \vspace{2mm}
408            \quad \texttt{u(:,:,nxl\textcolor{blue}{-nbgp}), u(:,:,nxr\textcolor{blue}{+nbgp}) ! left and right boundary}\\
409            \quad \texttt{u(:,nys\textcolor{blue}{-nbgp},:), u(:,nyn\textcolor{blue}{+nbgp},:) ! south and north boundary}\\
410            \vspace{1mm}
411            \scriptsize The actual code uses \texttt{\textcolor{blue}{nxlg}=nxl\textcolor{blue}{-nbgp}}, etc...\\
412            \vspace{2mm}
413            \item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\
414            \textbf{\underline{Simplified} example:} synchronous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\
415            \tiny
416            \vspace{2mm}
417            \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl),   ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft,  0,}\\
418            \quad \texttt{\hspace{9.5em}ar(nzb,\textcolor{blue}{nysg},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\
419            \quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\
420            \vspace{2mm}
421            \item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\
422            \tiny
423            \vspace{2mm}
424            \quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{nbgp} )}\\
425            \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{nbgp}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\ 
426            \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr )   ! see file init\underline{\ }pegrid.f90}\\
427            \ \\
428            \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl), 1, type\underline{\ }yz(grid\underline{\ }level), pleft, 0, ...}\\
429         \end{itemize}       
430\end{frame}
431
432% Folie 11
433\begin{frame}
434   \frametitle{Transpositions}   
435   \footnotesize
436    \begin{columns}[T]
437      \begin{column}{1.05\textwidth}
438         \begin{itemize}
439            \item<1-> Transpositions can be found in file \texttt{transpose.f90} (several subroutines for 1D- or 2D-decompositions; they are called mainly from the FFT pressure solver, see \texttt{poisfft.f90}).\\
440            \ \\
441            \item<2-> The following example is for a transposition from $x$ to $y$, i.e. for the input array all data elements along $x$ reside on the same PE, while after the transposition, all elements along $y$ are on the same PE:\\
442            \ \\
443            \tiny
444            \texttt{!}\\
445            \texttt{!--   in SUBROUTINE transpose\underline{\ }xy:}\\
446           \texttt{CALL MPI\underline{\ }ALLTOALL( f\underline{\ }inv(nys\underline{\ }x,nzb\underline{\ }x,0), \hspace{1em}sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
447           \texttt{\hspace{9.5em}work(1,nzb\underline{\ }y, nxl\underline{\ }y,0), sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
448           \texttt{\hspace{9.5em}comm1dy, ierr )}\\
449           \ \\
450           \item<3-> The data resorting before and after the calls of MPI\_ALLTOALL is highly optimized to account for the different processor architectures and even allows for overlapping communication and calculation.
451         \end{itemize}
452      \end{column}
453   \end{columns} 
454\end{frame}
455
456% Folie 12
457\begin{frame}
458   \frametitle{Parallel I/O} 
459   \scriptsize
460   \vspace{-2mm}
461   \begin{columns}[T]
462      \begin{column}{1.1\textwidth}
463         \begin{itemize}
464            \item<1-> PALM writes and reads some of the input/output files in parallel, i.e. each processor writes/reads its own file. \textbf{Each file then has a different name!}\\
465            \ \\
466            \textbf{Example:} binary files for restart are written into a subdirectory of the PALM working directory:\\
467            \quad \texttt{BINOUT/\_0000}\\
468                 \quad \texttt{BINOUT/\_0001}\\
469                 \quad $\vdots$
470                 \item<2-> These files can be handled (copied) by \texttt{mrun} using the file attribute \texttt{pe} in the configuration file:\\
471                 \texttt{BINOUT  out:loc:pe restart \~{}/palm/current\underline{\ }version/JOBS/\$fname/RESTART  \underline{\ }d3d}\\
472                 \ \\
473                 \onslide<3->In this case, filenames are interpreted as directory names.
474                 An \texttt{mrun} call using option\\
475                 ``\texttt{-d example\underline{\ }cbl -r restart}'' will copy the local \textbf{\underline{directory}} \texttt{BINOUT} to the \textbf{\underline{directory}} \texttt{.../RESTART/example\underline{\ }cbl\underline{\ }d3d}.
476         \end{itemize}
477         \onslide<4-> \textbf{General comment:}
478         \begin{itemize}
479            \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-option \texttt{-w})
480         \end{itemize}
481      \end{column}
482   \end{columns} 
483\end{frame}
484
485
486
487%Folie 13
488\begin{frame}
489   \frametitle{PALM Parallel I/O for 2D/3D Data} 
490   \footnotesize
491   \begin{itemize}
492      \item<1-> 2D- and 3D-data output is also written in parallel by the processors (2D: by default, 3D: generally).
493      \item<2-> Because the graphics software (\texttt{ncview}, \texttt{ncl}, \texttt{ferret}, etc.) expects the data to be in one file, these output files have to be merged to one single file after PALM has finished.\\
494      \ \\
495      This is done within the job by calling the utility program \texttt{combine\underline{\ }plot\underline{\ }fields.x} after PALM has successfully finished.
496      \item<3-> \texttt{combine\underline{\ }plot\underline{\ }fields.x} is automatically executed by \texttt{mrun}.
497      \item<4-> The executable \texttt{combine\underline{\ }plot\underline{\ }fields.x} is created during the installation process by the command\\
498      \ \\
499      \quad \texttt{mbuild -u -h <host identifier>}
500   \end{itemize}
501\end{frame}
502
503%Folie 14
504\begin{frame}
505   \frametitle{PALM Parallel I/O for 2D/3D Data with netCDF4/HDF5} 
506   \footnotesize
507   \begin{itemize}
508      \item<1-> The Cray XC30 of HLRN-III allows direct parallel I/O to a netCDF file
509      \vspace{2mm}
510      \item<2-> modules \texttt{cray\_hdf5\_parallel} and \texttt{cray\_netcdf\_hdf5parallel} have to be loaded
511      \vspace{2mm}
512      \item<3-> cpp-switches \texttt{-D\_\_netcdf}, \texttt{-D\_\_netcdf4}, \texttt{-D\_\_netcdf4\_parallel} have to be set
513      \vspace{2mm}
514      \item<4-> Both are done in the default HLRN-III block of the configuration file (\texttt{lccrayh})
515      \vspace{2mm}
516      \item<5-> \texttt{d3par}-parameter \texttt{netcdf\_data\_format=5} has to be set in the parameter file
517      \vspace{2mm}
518      \item<6-> \texttt{combine\_plot\_fields.x} is not required in this case
519   \end{itemize}
520\end{frame}
521
522%Folie 15
523\begin{frame}
524   \frametitle{Performance Examples (I)} 
525   \begin{itemize}
526      \item Simulation using 1536 * 768 * 242 grid points  ($\sim$ 60 GByte)
527   \end{itemize}
528   \includegraphics[scale=0.28]{parallelization_figures/perf_left.png} 
529   \includegraphics[scale=0.28]{parallelization_figures/perf_right.png}
530   \includegraphics[scale=0.28]{parallelization_figures/legende.png}  \\
531   \scriptsize
532      \begin{columns}[T]
533         \begin{column}{0.18\textwidth}
534         \end{column}
535         \begin{column}{0.4\textwidth}
536            IBM-Regatta, HLRN, Hannover\\
537            (1D domain decomposition)
538         \end{column}
539         \begin{column}{0.4\textwidth}
540            Sun Fire X4600, Tokyo Institute of Technology\\
541(2D domain decomposition)
542         \end{column} 
543         \begin{column}{0.2\textwidth}
544         \end{column}           
545         
546      \end{columns}
547
548\end{frame}
549
550%Folie 16
551\begin{frame}
552   \frametitle{Performance Examples (II)} 
553   \begin{itemize}
554      \item Simulation with $2048^3$ grid points  ($\sim$ 2 TByte memory)
555   \end{itemize}
556      \begin{columns}[T]
557         \begin{column}{0.5\textwidth}
558            \includegraphics[scale=0.25]{parallelization_figures/perf_3.png} \\
559            \scriptsize
560            \quad SGI-ICE2, HLRN-II, Hannover\\
561            \quad (2D-domain decomposition)
562         \end{column}
563         \begin{column}{0.5\textwidth}
564            \vspace{35mm}
565            \onslide<2-> largest simulation feasible on that system:\\
566            \ \\
567            $4096^3$ grid points
568         \end{column} 
569      \end{columns}
570\end{frame}
571
572%Folie 17
573\begin{frame}
574   \frametitle{Performance Examples (III)} 
575   \begin{itemize}
576      \item Simulation with $4320^3$ grid points  ($\sim$ 13 TByte memory)
577   \end{itemize}
578      \begin{columns}[T]
579         \begin{column}{0.5\textwidth}
580            \includegraphics[scale=0.5]{parallelization_figures/perf_4.png} \\
581            \scriptsize
582            \quad Cray-XC40, HLRN-III, Hannover\\
583            \quad (2D-domain decomposition)
584         \end{column}
585         \begin{column}{0.5\textwidth}
586            \vspace{35mm}
587            \onslide<2-> currently largest simulation feasible on that system:\\
588            \ \\
589            $5600^3$ grid points
590         \end{column} 
591      \end{columns}
592\end{frame}
593
594\end{document}
Note: See TracBrowser for help on using the repository browser.