Home

Context Navigation

source: palm/trunk/TUTORIAL/SOURCE/parallelization.tex @ 1515

Last change on this file since 1515 was 1515, checked in by boeske, 10 years ago
several updates in the tutorial
Property svn:executable set to ``* Property svn:keywords set to `Id`
File size: 26.5 KB

Line
1	% $Id: parallelization.tex 1515 2015-01-02 11:35:51Z boeske $
2	\input{header_tmp.tex}
3	%\input{../header_lectures.tex}
4
5	\usepackage[utf8]{inputenc}
6	\usepackage{ngerman}
7	\usepackage{pgf}
8	\usepackage{subfigure}
9	\usepackage{units}
10	\usepackage{multimedia}
11	\usepackage{hyperref}
12	\newcommand{\event}[1]{\newcommand{\eventname}{#1}}
13	\usepackage{xmpmulti}
14	\usepackage{tikz}
15	\usetikzlibrary{shapes,arrows,positioning}
16	%\usetikzlibrary{decorations.markings} %neues paket
17	%\usetikzlibrary{decorations.pathreplacing} %neues paket
18	\def\Tiny{\fontsize{4pt}{4pt}\selectfont}
19	\usepackage{amsmath}
20	\usepackage{amssymb}
21	\usepackage{multicol}
22	\usepackage{pdfcomment}
23	\usepackage{graphicx}
24	\usepackage{listings}
25	\lstset{showspaces=false,language=fortran,basicstyle=
26	\ttfamily,showstringspaces=false,captionpos=b}
27
28	\institute{Institute of Meteorology and Climatology, Leibniz Universit\"at Hannover}
29	\selectlanguage{english}
30	\date{last update: \today}
31	\event{PALM Seminar}
32	\setbeamertemplate{navigation symbols}{}
33
34	\setbeamertemplate{footline}
35	{
36	\begin{beamercolorbox}[rightskip=-0.1cm]&
37	{\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}}
38	\end{beamercolorbox}
39	\begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,
40	leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot}
41	{\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber}
42	\end{beamercolorbox}
43	\begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot}
44	\end{beamercolorbox}
45	}
46	%\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}}
47
48	\title[Parallelization]{Parallelization}
49	\author{PALM group}
50
51	\begin{document}
52
53	% Folie 1
54	\begin{frame}
55	\titlepage
56	\end{frame}
57
58	\section{Parallelization}
59	\subsection{Parallelization}
60
61	% Folie 2
62	\begin{frame}
63	\frametitle{Basics of Parallelization}
64	\tikzstyle{yellow} = [rectangle, fill=yellow!20, text width=0.4\textwidth, font=\tiny]
65	\scriptsize
66	\textbf{Parallelization:}
67	\begin{itemize}
68	\item<2-> All processor elements (PE, core) are carrying out the same program code (SIMD).
69	\item<3-> Each PE of a parallel computer operates on a different set of data.
70	\end{itemize}
71
72	\ \\
73	\onslide<4->\textbf{Realization:}
74	\begin{columns}[T]
75	\begin{column}{0.45\textwidth}
76	\onslide<5->each PE solves the equations for a different subdomain of the total domain
77	\begin{center}
78	\includegraphics[width=0.3\textwidth]{parallelization_figures/subdomain_folie2.png}
79	\end{center}
80	\onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\
81	\onslide<9->\textbf{message passing model (MPI)}
82	\end{column}
83	\begin{column}{0.45\textwidth}
84	\onslide<6->program loops are parallelized, i.e. each processor solves for a subset of the total index range
85	\begin{center}
86	\begin{tikzpicture}[auto, node distance=0]
87	\node [yellow] (1) {%
88	\texttt{!\$OMP DO}\\
89	\texttt{DO i = 1, 100}\\
90	\quad $\vdots$\\
91	\texttt{ENDDO}};
92	\end{tikzpicture}
93	\begin{tikzpicture}[auto, node distance=0]
94	\node [yellow] (2) {%
95	\texttt{!\$acc kernels}\\
96	\texttt{DO i = 1, 100}\\
97	\quad $\vdots$\\
98	\texttt{ENDDO}};
99	\end{tikzpicture}
100	\end{center}
101	\vspace{-1mm}
102	\onslide<8-> parallelization can easily be done by the compiler, if all PEs have access to all variables (shared memory)\\
103	\onslide<10-> \textbf{shared memory model (OpenMP)}
104	\onslide<10-> \textbf{accelerator model (OpenACC)}
105	\end{column}
106	\end{columns}
107	\end{frame}
108
109	% Folie 3
110	\begin{frame}
111	\frametitle{Basic Architectures of Massively Parallel Computers}
112	\tikzstyle{info} = [rectangle, text width=0.25\textwidth, font=\scriptsize]
113
114
115	\begin{center}
116	\begin{tikzpicture}
117
118	\node (center) at (0,1) {};
119	\onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network};
120	\node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-XC30)};
121	\onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory};
122	\node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)};
123	\onslide<7-> \node (MPI) at (-3.5,-3) [ellipse,fill=yellow!90] {MPI};
124	\onslide<8-> \node (OpenMP) at (3.5,-3) [ellipse,fill=yellow!90, text width=0.13\textwidth] {\footnotesize OpenMP OpenACC};
125	\onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems};
126	\node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster,
127	NEC-SX, SGI-ICE, Cray-XC)};
128
129	% Adressable memory node (big)
130
131	\onslide<3-> \node (p1) at (2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
132	\node (p2) at (2.6,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
133	\node (p3) at (3.2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
134	\node (p4) at (3.8,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
135	\node (p5) at (4.4,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
136	\node (p6) at (5,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
137
138	\draw[-] (3.5,0.7) -- (3.5,0.4);
139	\draw[-] (2,0.4) -- (5,0.4);
140	\draw[-] (2,0.4) -- (p1);
141	\draw[-] (2.6,0.4) -- (p2);
142	\draw[-] (3.2,0.4) -- (p3);
143	\draw[-] (3.8,0.4) -- (p4);
144	\draw[-] (4.4,0.4) -- (p5);
145	\draw[-] (5,0.4) -- (p6);
146
147	% Adressable memory node (small)
148	\onslide<4->
149
150	\node (small_node) at (-2,0.6) [scale=0.2] {%
151	\begin{tikzpicture}
152
153	\node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {};
154
155	\node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
156	\node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {};
157	\node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
158	\node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {};
159	\node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {};
160	\node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {};
161
162	\draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4);
163	\draw[-, ultra thick] (2,0.4) -- (5,0.4);
164	\draw[-, ultra thick] (2,0.4) -- (p1_small);
165	\draw[-, ultra thick] (2.6,0.4) -- (p2_small);
166	\draw[-, ultra thick] (3.2,0.4) -- (p3_small);
167	\draw[-, ultra thick] (3.8,0.4) -- (p4_small);
168	\draw[-, ultra thick] (4.4,0.4) -- (p5_small);
169	\draw[-, ultra thick] (5,0.4) -- (p6_small);
170
171
172	\end{tikzpicture}
173	} ;
174
175	\draw[->, thick] (1.5,0.2) -- (small_node) ;
176	\draw[-] (-2.7,0.75) -- (-2.3,0.725);
177	\onslide<5->
178	\node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node};
179
180	% Black Arrows
181	\onslide<6-> \draw[->, thick] (-2.5,-1.5) -- (-0.8,-2.2) ;
182	\draw[->, thick] (2.5,-1.5) -- (0.8,-2.2) ;
183
184	% MPI Arrows
185	\onslide<7-> \draw[->, ultra thick, color=yellow] (-3.5,-2.7) -- (-3.5,-1.5) ;
186	\draw[->, ultra thick, color=yellow] (-2.9,-3) -- (-1.0,-3.0) ;
187	\draw[->, ultra thick, color=yellow] (-3.0,-2.8) -- (1.5,-1.0) ;
188
189	% OpenMP Arrows
190	\onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ;
191	\draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ;
192
193	% Network decorations
194	\onslide<2-> \node (pr1) at (-4.6,0.7) [draw,circle,scale=0.5] {};
195	\node (mem1) at (-4.6,0.45) [draw,rectangle,scale=0.5] {};
196	\draw[-] (-4.5,0.9) -- (pr1);
197	\draw[-] (mem1) -- (pr1);
198	\draw[-] ([xshift=0.02cm]pr1.east) -- ([xshift=0.3cm, yshift=-0.2cm]pr1.east);
199	\draw[-] ([xshift=0.02cm]mem1.east) -- ([xshift=0.3cm, yshift=-0.2cm]mem1.east);
200	\node at (-3.7,0.45) {\tiny processor};
201	\node at (-3.3,0.25) {\tiny addressable memory};
202
203	\node (pr2) at (-4.6,1.3) [draw,circle,scale=0.5] {};
204	\node (mem2) at (-4.6,1.55) [draw,rectangle,scale=0.5] {};
205	\draw[-] (-4.5,1.1) -- (pr2);
206	\draw[-] (mem2) -- (pr2);
207	\node (pr3) at (-3.9,1.5) [draw,circle,scale=0.5] {};
208	\node (mem3) at (-3.9,1.75) [draw,rectangle,scale=0.5] {};
209	\draw[-] (-3.8,1.3) -- (pr3);
210	\draw[-] (mem3) -- (pr3);
211	\node (pr4) at (-4.9,0.95) [draw,circle,scale=0.5] {};
212	\node (mem4) at (-4.9,0.7) [draw,rectangle,scale=0.5] {};
213	\draw[-] (-4.55,1.0) -- (pr4);
214	\draw[-] (mem4) -- (pr4);
215	\node (pr5) at (-3.0,1.5) [draw,circle,scale=0.5] {};
216	\node (mem5) at (-3.0,1.75) [draw,rectangle,scale=0.5] {};
217	\draw[-] (-3.08,1.3) -- (pr5);
218	\draw[-] (mem5) -- (pr5);
219	\node (pr6) at (-2.2,1.1) [draw,circle,scale=0.5] {};
220	\node (mem6) at (-2.2,1.35) [draw,rectangle,scale=0.5] {};
221	\draw[-] (-2.45,1.0) -- (pr6);
222	\draw[-] (mem6) -- (pr6);
223
224	\onslide<1->
225	\end{tikzpicture}
226	\end{center}
227
228	\end{frame}
229
230	% Folie 4
231	\begin{frame}
232	\frametitle{PALM Parallelization Model}
233	\scriptsize
234	\onslide<2-> \textbf{General demands for a parallelized program:}
235	\begin{itemize}
236	\item<3-> Load balancing
237	\item<4-> Small communication overhead
238	\item<5-> Scalability (up to large numbers of processors)
239	\end{itemize}
240	\vspace{2mm}
241	\onslide<6-> \textbf{The basic parallelization method used for PALM is a 2D-domain decomposition along $x$ and $y$:}\\
242	\begin{columns}[T]
243	\begin{column}{0.3\textwidth}
244	\onslide<7-> \includegraphics[width=1.0\textwidth]{parallelization_figures/subdomain.png}
245	\end{column}
246	\begin{column}{0.6\textwidth}
247	\ \\
248	\onslide<8-> contiguous data in memory (FORTRAN):\\
249	\ \\
250	\ \\
251	\textcolor{blue}{columns of i}\\
252	\textcolor{red}{no contiguous data at all}\\
253	\onslide<9-> \textcolor{blue}{columns of k}\\
254	\textcolor{red}{planes of k,j (all data contiguous)}
255	\end{column}
256	\end{columns}
257	\vspace{2mm}
258	\begin{itemize}
259	\item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used.
260	\vspace{2mm}
261	\item<11-> Message passing is realized using MPI.
262	\vspace{2mm}
263	\item<12-> OpenMP parallelization as well as mixed usage of OpenMP and
264	MPI is also realized.
265	\end{itemize}
266	\end{frame}
267
268	% Folie 5
269	\begin{frame}
270	\frametitle{Implications of Decomposition}
271	\scriptsize
272	\begin{columns}[T]
273	\begin{column}{0.5\textwidth}
274	\begin{itemize}
275	\item<2-> Central finite differences cause \textcolor{red}{local data dependencies}\\
276	\ \\
277	solution: introduction of \textcolor{red}{ghost points}
278	\vspace{5mm}
279
280	\begin{flushright}
281	\onslide<3-> $\left. \dfrac{\partial \psi}{\partial x} \right|_i = \dfrac{\psi_{i+1} - \psi_{i-1}}{2 \Delta x}$
282	\end{flushright}
283	\vspace{10mm}
284	\item<4-> FFT and linear equation solver cause \textcolor{red}{non-local data dependencies}\\
285	\ \\
286	solution: transposition of 3D-arrays
287	\end{itemize}
288	\end{column}
289	\begin{column}{0.6\textwidth}
290	\begin{center}
291	\vspace{-5mm}
292	\onslide<3-> \includegraphics[width=0.7\textwidth]{parallelization_figures/ghost_points.png}
293	\vspace{4mm}
294	\onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center}
295	\vspace{-4mm}
296	\textbf{Example: transpositions for solving the Poisson\\ \hspace{4.1em}equation}
297	\end{column}
298	\end{columns}
299	\end{frame}
300
301	% Folie 6
302	\begin{frame}
303	\frametitle{How to Use the Parallelized Version of PALM}
304	\scriptsize
305	\begin{columns}[T]
306	\begin{column}{1.12\textwidth}
307	\begin{itemize}
308	\item<1-> The parallel version of PALM is switched on by \texttt{mrun}-option ``\texttt{-K parallel}''. Additionally, the number of required processors and the number of tasks per node (number of PEs to be used on one node) have to be provided:\\
309	\quad \texttt{mrun ... -K parallel -X64 -T8 ...}
310	\item<2-> From an accounting point of view, it is always most efficient to use all PEs of a node (e.g. \texttt{-T8}) (in case of a ``non-shared'' usage of nodes).
311	\item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1-2 PEs less than the total number of PEs on the node are used.
312	\item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g. \texttt{-h necriam}). The virtual processor grid to be used can be set manually by d3par-parameters \texttt{npex} and \texttt{npey}.
313	\item<5-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?) or with Intel-Xeon-Phi accelerator boards.\\
314	\end{itemize}
315	\begin{center}
316	\vspace{-3mm}
317	\onslide<4-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png}
318	\end{center}
319	\end{column}
320	\end{columns}
321	\end{frame}
322
323	% Folie 7
324	\begin{frame}
325	\frametitle{MPI Communication}
326	\scriptsize
327	\begin{columns}[T]
328	\begin{column}{1.12\textwidth}
329	\begin{itemize}
330	\item<1-> MPI (message passing interface) is a portable interface for communication between PEs (FORTRAN or C library).
331	\vspace{2mm}
332	\item<2-> MPI on the Cray-XC30 of HLRN-III is provided with module \texttt{PrgEnv-cray} which is loaded by default.
333	\vspace{2mm}
334	\item<3-> All MPI calls must be within\\
335	\quad \texttt{CALL MPI\_INIT( ierror )}\\
336	\quad $\vdots$\\
337	\quad \texttt{CALL MPI\_FINALIZE( ierror )}\\
338	\end{itemize}
339
340	\end{column}
341	\end{columns}
342	\end{frame}
343
344	% Folie 8
345	\begin{frame}
346	\frametitle{Communication Within PALM}
347	\small
348	\begin{itemize}
349	\item<1-> MPI calls within PALM are available when using the \texttt{mrun}-option ``\texttt{-K parallel}''.
350	\item<2-> Communication is needed for
351	\begin{itemize}
352	\footnotesize
353	\item<2-> exchange of ghost points
354	\item<3-> transpositions (FFT-Poisson-solver)
355	\item<4-> calculating global sums (e.g. for calculating horizontal averages)
356	\end{itemize}
357	\item<5-> Additional MPI calls are required to define the so-called virtual processor grid and to define special data types needed for more comfortable exchange of data.
358	\end{itemize}
359	\end{frame}
360
361	% Folie 9
362	\begin{frame}
363	\frametitle{Virtual Processor Grid Used in PALM}
364	\scriptsize
365	\vspace{2mm}
366	The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\
367	\ \\
368	\begin{itemize}
369	\item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so called communicator (here: \texttt{comm2d}):\\
370	\tiny
371	\vspace{1.5mm}
372	\quad \texttt{ndim = 2}\\
373	\quad \texttt{pdims(1) = npex \quad ! \# of processors along x}\\
374	\quad \texttt{pdims(2) = npey \quad ! \# of processors along y}\\
375	\quad \texttt{cyclic(1) = .TRUE.}\\
376	\quad \texttt{cyclic(2) = .TRUE.}\\
377	\ \\
378	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\
379	\quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )}
380	\scriptsize
381	\vspace{4mm}
382	\item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\
383	\tiny
384	\vspace{1.5mm}
385	\quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )}
386	\scriptsize
387	\vspace{4mm}
388	\item<4-> The ids of the neighbouring PEs are determined by:\\
389	\tiny
390	\vspace{1.5mm}
391	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\
392	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\
393	\end{itemize}
394	\end{frame}
395
396	% Folie 10
397	\begin{frame}
398	\frametitle{Exchange of Ghost Points}
399	\scriptsize
400	\begin{itemize}
401	\item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\
402	\tiny
403	\vspace{2mm}
404	\quad \texttt{u(:,:,nxl\textcolor{blue}{-nbgp}), u(:,:,nxr\textcolor{blue}{+nbgp}) ! left and right boundary}\\
405	\quad \texttt{u(:,nys\textcolor{blue}{-nbgp},:), u(:,nyn\textcolor{blue}{+nbgp},:) ! south and north boundary}\\
406	\vspace{1mm}
407	\scriptsize The actual code uses \texttt{\textcolor{blue}{nxlg}=nxl\textcolor{blue}{-nbgp}}, etc.\\
408	\vspace{2mm}
409	\item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\
410	\textbf{\underline{Simplified} example:} synchronous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\
411	\tiny
412	\vspace{2mm}
413	\quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl), ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft, 0,}\\
414	\quad \texttt{\hspace{9.5em}ar(nzb,\textcolor{blue}{nysg},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\
415	\quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\
416	\vspace{2mm}
417	\item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\
418	\tiny
419	\vspace{2mm}
420	\quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{nbgp} )}\\
421	\quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{nbgp}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\
422	\quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\
423	\ \\
424	\quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl), 1, type\underline{\ }yz(grid\underline{\ }level), pleft, 0, ...}\\
425	\end{itemize}
426	\end{frame}
427
428	% Folie 11
429	\begin{frame}
430	\frametitle{Transpositions}
431	\footnotesize
432	\begin{columns}[T]
433	\begin{column}{1.05\textwidth}
434	\begin{itemize}
435	\item<1-> Transpositions can be found in file \texttt{transpose.f90} (several subroutines for 1D- or 2D-decompositions; they are called mainly from the FFT pressure solver, see \texttt{poisfft.f90}).\\
436	\ \\
437	\item<2-> The following example is for a transposition from $x$ to $y$, i.e. for the input array all data elements along $x$ reside on the same PE, while after the transposition, all elements along $y$ are on the same PE:\\
438	\ \\
439	\tiny
440	\texttt{!}\\
441	\texttt{!-- in SUBROUTINE transpose\underline{\ }xy:}\\
442	\texttt{CALL MPI\underline{\ }ALLTOALL( f\underline{\ }inv(nys\underline{\ }x,nzb\underline{\ }x,0), \hspace{1em}sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
443	\texttt{\hspace{9.5em}work(1,nzb\underline{\ }y, nxl\underline{\ }y,0), sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
444	\texttt{\hspace{9.5em}comm1dy, ierr )}\\
445	\ \\
446	\item<3-> The data resorting before and after the calls of MPI\_ALLTOALL is highly optimized to account for the different processor architectures and even allows for overlapping communication and calculation.
447	\end{itemize}
448	\end{column}
449	\end{columns}
450	\end{frame}
451
452	% Folie 12
453	\begin{frame}
454	\frametitle{Parallel I/O}
455	\scriptsize
456	\vspace{-2mm}
457	\begin{columns}[T]
458	\begin{column}{1.1\textwidth}
459	\begin{itemize}
460	\item<1-> PALM writes and reads some of the input/output files in parallel, i.e. each processor writes/reads its own file. \textbf{Each file then has a different name!}\\
461	\ \\
462	\textbf{Example:} binary files for restart are written into a subdirectory of the PALM working directory:\\
463	\quad \texttt{BINOUT/\_0000}\\
464	\quad \texttt{BINOUT/\_0001}\\
465	\quad $\vdots$
466	\item<2-> These files can be handled (copied) by \texttt{mrun} using the file attribute \texttt{pe} in the configuration file:\\
467	\texttt{BINOUT out:loc:pe restart \~{}/palm/current\underline{\ }version/JOBS/\$fname/RESTART \underline{\ }d3d}\\
468	\ \\
469	\onslide<3->In this case, filenames are interpreted as directory names.
470	An \texttt{mrun} call using option\\
471	``\texttt{-d example\underline{\ }cbl -r restart}'' will copy the local \textbf{\underline{directory}} \texttt{BINOUT} to the \textbf{\underline{directory}} \texttt{.../RESTART/example\underline{\ }cbl\underline{\ }d3d} .
472	\end{itemize}
473	\onslide<4-> \textbf{General comment:}
474	\begin{itemize}
475	\item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-option \texttt{-w})
476	\end{itemize}
477	\end{column}
478	\end{columns}
479	\end{frame}
480
481
482
483	%Folie 13
484	\begin{frame}
485	\frametitle{PALM Parallel I/O for 2D/3D Data}
486	\footnotesize
487	\begin{itemize}
488	\item<1-> 2D- and 3D-data output is also written in parallel by the processors (2D: by default, 3D: generally).
489	\item<2-> Because the graphics software (\texttt{ncview}, \texttt{ncl}, \texttt{ferret}, etc.) expects the data to be in one file, these output files have to be merged into one single file after PALM has finished.\\
490	\ \\
491	This is done within the job by calling the utility program \texttt{combine\underline{\ }plot\underline{\ }fields.x} after PALM has successfully finished.
492	\item<3-> \texttt{combine\underline{\ }plot\underline{\ }fields.x} is automatically executed by \texttt{mrun}.
493	\item<4-> The executable \texttt{combine\underline{\ }plot\underline{\ }fields.x} is created during the installation process by the command\\
494	\ \\
495	\quad \texttt{mbuild -u -h <host identifier>}
496	\end{itemize}
497	\end{frame}
498
499	%Folie 14
500	\begin{frame}
501	\frametitle{PALM Parallel I/O for 2D/3D Data with netCDF4/HDF5}
502	\footnotesize
503	\begin{itemize}
504	\item<1-> The Cray XC30 of HLRN-III allows direct parallel I/O to a netCDF file
505	\vspace{2mm}
506	\item<2-> modules \texttt{cray\_hdf5\_parallel} and \texttt{cray\_netcdf\_hdf5parallel} have to be loaded
507	\vspace{2mm}
508	\item<3-> cpp-switches \texttt{-D\_\_netcdf}, \texttt{-D\_\_netcdf4}, \texttt{-D\_\_netcdf4\_parallel} have to be set
509	\vspace{2mm}
510	\item<4-> Both are done in the default HLRN-III block of the configuration file (\texttt{lccrayh})
511	\vspace{2mm}
512	\item<5-> \texttt{d3par}-parameter \texttt{netcdf\_data\_format=5} has to be set in the parameter file
513	\vspace{2mm}
514	\item<6-> \texttt{combine\_plot\_fields.x} is not required in this case
515	\end{itemize}
516	\end{frame}
517
518	%Folie 15
519	\begin{frame}
520	\frametitle{Performance Examples (I)}
521	\begin{itemize}
522	\item Simulation using $1536 \times 768 \times 242$ grid points ($\sim$ 60 GByte)
523	\end{itemize}
524	\includegraphics[scale=0.28]{parallelization_figures/perf_left.png}
525	\includegraphics[scale=0.28]{parallelization_figures/perf_right.png}
526	\includegraphics[scale=0.28]{parallelization_figures/legende.png} \\
527	\scriptsize
528	\begin{columns}[T]
529	\begin{column}{0.18\textwidth}
530	\end{column}
531	\begin{column}{0.4\textwidth}
532	IBM-Regatta, HLRN, Hannover\\
533	(1D domain decomposition)
534	\end{column}
535	\begin{column}{0.4\textwidth}
536	Sun Fire X4600, Tokyo Institute of Technology\\
537	(2D domain decomposition)
538	\end{column}
539	\begin{column}{0.2\textwidth}
540	\end{column}
541
542	\end{columns}
543
544	\end{frame}
545
546	%Folie 16
547	\begin{frame}
548	\frametitle{Performance Examples (II)}
549	\begin{itemize}
550	\item Simulation with $2048^3$ grid points ($\sim$ 2 TByte memory)
551	\end{itemize}
552	\begin{columns}[T]
553	\begin{column}{0.5\textwidth}
554	\includegraphics[scale=0.25]{parallelization_figures/perf_3.png} \\
555	\scriptsize
556	\quad SGI-ICE2, HLRN-II, Hannover\\
557	\quad (2D-domain decomposition)
558	\end{column}
559	\begin{column}{0.5\textwidth}
560	\vspace{35mm}
561	\onslide<2-> largest simulation feasible on that system:\\
562	\ \\
563	$4096^3$ grid points
564	\end{column}
565	\end{columns}
566	\end{frame}
567
568	%Folie 17
569	\begin{frame}
570	\frametitle{Performance Examples (III)}
571	\begin{itemize}
572	\item Simulation with $2160^3$ grid points ($\sim$ 2 TByte memory)
573	\end{itemize}
574	\begin{columns}[T]
575	\begin{column}{0.5\textwidth}
576	\includegraphics[scale=0.3]{parallelization_figures/perf_4.png} \\
577	\scriptsize
578	\quad Cray-XC30, HLRN-III, Hannover\\
579	\quad (2D-domain decomposition)
580	\end{column}
581	\begin{column}{0.5\textwidth}
582	\vspace{35mm}
583	\onslide<2-> currently largest simulation feasible on that system:\\
584	\ \\
585	$5600^3$ grid points
586	\end{column}
587	\end{columns}
588	\end{frame}
589
590	\end{document}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |