Institute of Meteorology and Climatology, Leibniz Universität Hannover
29 | \selectlanguage{english} |
last update: \today
PALM Seminar
Parallelization
PALM group
50 | |
58 | \section{Parallelization} |
59 | \subsection{Parallelization} |
60 | |
61 | % Folie 2 |
62 | \begin{frame} |
63 | \frametitle{Basics of Parallelization} |
64 | \tikzstyle{yellow} = [rectangle, fill=yellow!20, text width=0.4\textwidth, font=\tiny] |
65 | \scriptsize |
66 | \textbf{Parallelization:} |
67 | \begin{itemize} |
68 | \item<2-> All processor elements (PE, core) are carrying out the same program code (SPMD). |
69 | \item<3-> Each PE of a parallel computer operates on a different set of data. |
70 | \end{itemize} |
71 | |
72 | \ \\ |
73 | \onslide<4->\textbf{Realization:} |
74 | \begin{columns}[T] |
75 | \begin{column}{0.45\textwidth} |
76 | \onslide<5->each PE solves the equations for a different subdomain of the total domain |
77 | \begin{center} |
78 | \includegraphics[width=0.3\textwidth]{parallelization_figures/subdomain_folie2.png} |
79 | \end{center} |
80 | \onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\ |
81 | \onslide<9->\textbf{message passing model (MPI)} |
82 | \end{column} |
83 | \begin{column}{0.45\textwidth} |
84 | \onslide<6->program loops are parallelized, i.e. each processor solves for a subset of the total index range |
85 | \begin{center} |
86 | \begin{tikzpicture}[auto, node distance=0] |
87 | \node [yellow] (1) {% |
88 | \texttt{!\$OMP DO}\\ |
89 | \texttt{DO i = 1, 100}\\ |
90 | \quad $\vdots$\\ |
91 | \texttt{ENDDO}}; |
92 | \end{tikzpicture} |
93 | \begin{tikzpicture}[auto, node distance=0] |
94 | \node [yellow] (2) {% |
95 | \texttt{!\$acc kernels}\\ |
96 | \texttt{DO i = 1, 100}\\ |
97 | \quad $\vdots$\\ |
98 | \texttt{ENDDO}}; |
99 | \end{tikzpicture} |
100 | \end{center} |
101 | \vspace{-1mm} |
102 | \onslide<8-> parallelization can easily be done by the compiler, if all PEs have access to all variables (shared memory)\\ |
103 | \onslide<10-> \textbf{shared memory model (OpenMP)} |
104 | \onslide<10-> \textbf{accelerator model (OpenACC)} |
105 | \end{column} |
106 | \end{columns} |
107 | \end{frame} |
108 | |
109 | % Folie 3 |
110 | \begin{frame} |
111 | \frametitle{Basic Architectures of Massively Parallel Computers} |
112 | \tikzstyle{info} = [rectangle, text width=0.25\textwidth, font=\scriptsize] |
113 | |
114 | |
115 | \begin{center} |
116 | \begin{tikzpicture} |
117 | |
118 | \node (center) at (0,1) {}; |
119 | \onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network}; |
120 | \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-XC30)}; |
121 | \onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory}; |
122 | \node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)}; |
123 | \onslide<7-> \node (MPI) at (-3.5,-3) [ellipse,fill=yellow!90] {MPI}; |
124 | \onslide<8-> \node (OpenMP) at (3.5,-3) [ellipse,fill=yellow!90, text width=0.13\textwidth] {\footnotesize OpenMP OpenACC}; |
125 | \onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems}; |
126 | \node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster, |
127 | NEC-SX, SGI-ICE, Cray-XC)}; |
128 | |
129 | % Addressable memory node (big) |
130 | |
131 | \onslide<3-> \node (p1) at (2,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
132 | \node (p2) at (2.6,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
133 | \node (p3) at (3.2,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
134 | \node (p4) at (3.8,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
135 | \node (p5) at (4.4,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
136 | \node (p6) at (5,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
137 | |
138 | \draw[-] (3.5,0.7) -- (3.5,0.4); |
139 | \draw[-] (2,0.4) -- (5,0.4); |
140 | \draw[-] (2,0.4) -- (p1); |
141 | \draw[-] (2.6,0.4) -- (p2); |
142 | \draw[-] (3.2,0.4) -- (p3); |
143 | \draw[-] (3.8,0.4) -- (p4); |
144 | \draw[-] (4.4,0.4) -- (p5); |
145 | \draw[-] (5,0.4) -- (p6); |
146 | |
147 | % Addressable memory node (small) |
148 | \onslide<4-> |
149 | |
150 | \node (small_node) at (-2,0.6) [scale=0.2] {% |
151 | \begin{tikzpicture} |
152 | |
153 | \node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {}; |
154 | |
155 | \node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
156 | \node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
157 | \node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
158 | \node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
159 | \node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
160 | \node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
161 | |
162 | \draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4); |
163 | \draw[-, ultra thick] (2,0.4) -- (5,0.4); |
164 | \draw[-, ultra thick] (2,0.4) -- (p1_small); |
165 | \draw[-, ultra thick] (2.6,0.4) -- (p2_small); |
166 | \draw[-, ultra thick] (3.2,0.4) -- (p3_small); |
167 | \draw[-, ultra thick] (3.8,0.4) -- (p4_small); |
168 | \draw[-, ultra thick] (4.4,0.4) -- (p5_small); |
169 | \draw[-, ultra thick] (5,0.4) -- (p6_small); |
170 | |
171 | |
172 | \end{tikzpicture} |
173 | } ; |
174 | |
175 | \draw[->, thick] (1.5,0.2) -- (small_node) ; |
176 | \draw[-] (-2.7,0.75) -- (-2.3,0.725); |
177 | \onslide<5-> |
178 | \node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node}; |
179 | |
180 | % Black Arrows |
181 | \onslide<6-> \draw[->, thick] (-2.5,-1.5) -- (-0.8,-2.2) ; |
182 | \draw[->, thick] (2.5,-1.5) -- (0.8,-2.2) ; |
183 | |
184 | % MPI Arrows |
185 | \onslide<7-> \draw[->, ultra thick, color=yellow] (-3.5,-2.7) -- (-3.5,-1.5) ; |
186 | \draw[->, ultra thick, color=yellow] (-2.9,-3) -- (-1.0,-3.0) ; |
187 | \draw[->, ultra thick, color=yellow] (-3.0,-2.8) -- (1.5,-1.0) ; |
188 | |
189 | % OpenMP Arrows |
190 | \onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ; |
191 | \draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ; |
192 | |
193 | % Network decorations |
194 | \onslide<2-> \node (pr1) at (-4.6,0.7) [draw,circle,scale=0.5] {}; |
195 | \node (mem1) at (-4.6,0.45) [draw,rectangle,scale=0.5] {}; |
196 | \draw[-] (-4.5,0.9) -- (pr1); |
197 | \draw[-] (mem1) -- (pr1); |
198 | \draw[-] ([xshift=0.02cm]pr1.east) -- ([xshift=0.3cm, yshift=-0.2cm]pr1.east); |
199 | \draw[-] ([xshift=0.02cm]mem1.east) -- ([xshift=0.3cm, yshift=-0.2cm]mem1.east); |
200 | \node at (-3.7,0.45) {\tiny processor}; |
201 | \node at (-3.3,0.25) {\tiny addressable memory}; |
202 | |
203 | \node (pr2) at (-4.6,1.3) [draw,circle,scale=0.5] {}; |
204 | \node (mem2) at (-4.6,1.55) [draw,rectangle,scale=0.5] {}; |
205 | \draw[-] (-4.5,1.1) -- (pr2); |
206 | \draw[-] (mem2) -- (pr2); |
207 | \node (pr3) at (-3.9,1.5) [draw,circle,scale=0.5] {}; |
208 | \node (mem3) at (-3.9,1.75) [draw,rectangle,scale=0.5] {}; |
209 | \draw[-] (-3.8,1.3) -- (pr3); |
210 | \draw[-] (mem3) -- (pr3); |
211 | \node (pr4) at (-4.9,0.95) [draw,circle,scale=0.5] {}; |
212 | \node (mem4) at (-4.9,0.7) [draw,rectangle,scale=0.5] {}; |
213 | \draw[-] (-4.55,1.0) -- (pr4); |
214 | \draw[-] (mem4) -- (pr4); |
215 | \node (pr5) at (-3.0,1.5) [draw,circle,scale=0.5] {}; |
216 | \node (mem5) at (-3.0,1.75) [draw,rectangle,scale=0.5] {}; |
217 | \draw[-] (-3.08,1.3) -- (pr5); |
218 | \draw[-] (mem5) -- (pr5); |
219 | \node (pr6) at (-2.2,1.1) [draw,circle,scale=0.5] {}; |
220 | \node (mem6) at (-2.2,1.35) [draw,rectangle,scale=0.5] {}; |
221 | \draw[-] (-2.45,1.0) -- (pr6); |
222 | \draw[-] (mem6) -- (pr6); |
223 | |
224 | \onslide<1-> |
225 | \end{tikzpicture} |
226 | \end{center} |
227 | |
228 | \end{frame} |
229 | |
230 | % Folie 4 |
231 | \begin{frame} |
232 | \frametitle{PALM Parallelization Model} |
233 | \scriptsize |
234 | \onslide<2-> \textbf{General demands for a parallelized program:} |
235 | \begin{itemize} |
236 | \item<3-> Load balancing |
237 | \item<4-> Small communication overhead |
238 | \item<5-> Scalability (up to large numbers of processors) |
239 | \end{itemize} |
240 | \vspace{2mm} |
241 | \onslide<6-> \textbf{The basic parallelization method used for PALM is a 2D-domain decomposition along $x$ and $y$:}\\ |
242 | \begin{columns}[T] |
243 | \begin{column}{0.3\textwidth} |
244 | \onslide<7-> \includegraphics[width=1.0\textwidth]{parallelization_figures/subdomain.png} |
245 | \end{column} |
246 | \begin{column}{0.6\textwidth} |
247 | \ \\ |
248 | \onslide<8-> contiguous data in memory (FORTRAN):\\ |
249 | \ \\ |
250 | \ \\ |
251 | \vspace*{-0.05cm} |
252 | \textcolor{blue}{columns of i}\\ |
253 | \vspace*{-0.04cm} |
254 | \textcolor{red}{no contiguous data at all}\\ |
255 | \vspace*{0.17cm} |
256 | \onslide<9-> \textcolor{blue}{columns of k}\\ |
257 | \vspace*{-0.04cm} |
258 | \textcolor{red}{planes of k,j (all data contiguous)} |
259 | \end{column} |
260 | \end{columns} |
261 | \vspace{2mm} |
262 | \begin{itemize} |
263 | \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used. |
264 | \vspace{2mm} |
265 | \item<11-> Message passing is realized using MPI. |
266 | \vspace{2mm} |
267 | \item<12-> OpenMP parallelization as well as mixed usage of OpenMP and |
268 | MPI is realized. |
269 | \end{itemize} |
270 | \end{frame} |
271 | |
272 | % Folie 5 |
273 | \begin{frame} |
274 | \frametitle{Implications of Decomposition} |
275 | \scriptsize |
276 | \begin{columns}[T] |
277 | \begin{column}{0.5\textwidth} |
278 | \begin{itemize} |
279 | \item<2-> Central finite differences cause \textcolor{red}{local data dependencies}\\ |
280 | \ \\ |
281 | solution: introduction of \textcolor{red}{ghost points} |
282 | \vspace{5mm} |
283 | |
284 | \begin{flushright} |
285 | \onslide<3-> $\left. \dfrac{\partial \psi}{\partial x} \right|_i = \dfrac{\psi_{i+1} - \psi_{i-1}}{2 \Delta x}$ |
286 | \end{flushright} |
287 | \vspace{10mm} |
288 | \item<4-> FFT and linear equation solver cause \textcolor{red}{non-local data dependencies}\\ |
289 | \ \\ |
290 | solution: transposition of 3D-arrays |
291 | \end{itemize} |
292 | \end{column} |
293 | \begin{column}{0.6\textwidth} |
294 | \begin{center} |
295 | \vspace{-5mm} |
296 | \onslide<3-> \includegraphics[width=0.7\textwidth]{parallelization_figures/ghost_points.png} |
297 | \vspace{4mm} |
298 | \onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center} |
299 | \vspace{-4mm} |
300 | \textbf{Example: transpositions for solving the Poisson\\ \hspace{4.1em}equation} |
301 | \end{column} |
302 | \end{columns} |
303 | \end{frame} |
304 | |
305 | % Folie 6 |
306 | \begin{frame} |
307 | \frametitle{How to Use the Parallelized Version of PALM} |
308 | \scriptsize |
309 | \begin{columns}[T] |
310 | \begin{column}{1.12\textwidth} |
311 | \begin{itemize} |
312 | \item<1-> The parallel version of PALM is switched on by \texttt{mrun}-option ``\texttt{-K parallel}''. Additionally, the number of required processors and the number of tasks per node (number of PEs to be used on one node) have to be provided:\\ |
313 | \quad \texttt{mrun ... -K parallel -X64 -T8 ...} |
314 | \item<2-> From an accounting point of view, it is always most efficient to use all PEs of a node (e.g. \texttt{-T8}) (in case of a ``non-shared'' usage of nodes). |
315 | \item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1-2 PEs less than the total number of PEs on the node are used. |
316 | \item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g. \texttt{-h necriam}). The virtual processor grid to be used can be set manually by d3par-parameters \texttt{npex} and \texttt{npey}. |
317 | \item<5-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?) or with Intel-Xeon-Phi accelerator boards.\\ |
318 | \end{itemize} |
319 | \begin{center} |
320 | \vspace{-3mm} |
321 | \onslide<4-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png} |
322 | \end{center} |
323 | \end{column} |
324 | \end{columns} |
325 | \end{frame} |
326 | |
327 | % Folie 7 |
328 | \begin{frame} |
329 | \frametitle{MPI Communication} |
330 | \scriptsize |
331 | \begin{columns}[T] |
332 | \begin{column}{1.12\textwidth} |
333 | \begin{itemize} |
334 | \item<1-> MPI (message passing interface) is a portable interface for communication between PEs (FORTRAN or C library). |
335 | \vspace{2mm} |
336 | \item<2-> MPI on the Cray-XC30 of HLRN-III is provided with module \texttt{PrgEnv-cray} which is loaded by default. |
337 | \vspace{2mm} |
338 | \item<3-> All MPI calls must be within\\ |
339 | \quad \texttt{CALL MPI\_INIT( ierror )}\\ |
340 | \quad $\vdots$\\ |
341 | \quad \texttt{CALL MPI\_FINALIZE( ierror )}\\ |
342 | \end{itemize} |
343 | |
344 | \end{column} |
345 | \end{columns} |
346 | \end{frame} |
347 | |
348 | % Folie 8 |
349 | \begin{frame} |
350 | \frametitle{Communication Within PALM} |
351 | \small |
352 | \begin{itemize} |
353 | \item<1-> MPI calls within PALM are available when using the \texttt{mrun}-option ``\texttt{-K parallel}''. |
354 | \item<2-> Communication is needed for |
355 | \begin{itemize} |
356 | \footnotesize |
357 | \item<2-> exchange of ghost points |
358 | \item<3-> transpositions (FFT-Poisson-solver) |
359 | \item<4-> calculating global sums (e.g. for calculating horizontal averages) |
360 | \end{itemize} |
361 | \item<5-> Additional MPI calls are required to define the so-called virtual processor grid and to define special data types needed for more comfortable exchange of data. |
362 | \end{itemize} |
363 | \end{frame} |
364 | |
365 | % Folie 9 |
366 | \begin{frame} |
367 | \frametitle{Virtual Processor Grid Used in PALM} |
368 | \scriptsize |
369 | \vspace{2mm} |
370 | The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\ |
371 | \ \\ |
372 | \begin{itemize} |
373 | \item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so-called communicator (here: \texttt{comm2d}):\\ |
374 | \tiny |
375 | \vspace{1.5mm} |
376 | \quad \texttt{ndim = 2}\\ |
377 | \quad \texttt{pdims(1) = npex \quad ! \# of processors along x}\\ |
378 | \quad \texttt{pdims(2) = npey \quad ! \# of processors along y}\\ |
379 | \quad \texttt{cyclic(1) = .TRUE.}\\ |
380 | \quad \texttt{cyclic(2) = .TRUE.}\\ |
381 | \ \\ |
382 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\ |
383 | \quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )} |
384 | \scriptsize |
385 | \vspace{4mm} |
386 | \item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\ |
387 | \tiny |
388 | \vspace{1.5mm} |
389 | \quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )} |
390 | \scriptsize |
391 | \vspace{4mm} |
392 | \item<4-> The ids of the neighbouring PEs are determined by:\\ |
393 | \tiny |
394 | \vspace{1.5mm} |
395 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\ |
396 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\ |
397 | \end{itemize} |
398 | \end{frame} |
399 | |
400 | % Folie 10 |
401 | \begin{frame} |
402 | \frametitle{Exchange of Ghost Points} |
403 | \scriptsize |
404 | \begin{itemize} |
405 | \item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\ |
406 | \tiny |
407 | \vspace{2mm} |
408 | \quad \texttt{u(:,:,nxl\textcolor{blue}{-nbgp}), u(:,:,nxr\textcolor{blue}{+nbgp}) ! left and right boundary}\\ |
409 | \quad \texttt{u(:,nys\textcolor{blue}{-nbgp},:), u(:,nyn\textcolor{blue}{+nbgp},:) ! south and north boundary}\\ |
410 | \vspace{1mm} |
411 | \scriptsize The actual code uses \texttt{\textcolor{blue}{nxlg}=nxl\textcolor{blue}{-nbgp}}, etc...\\ |
412 | \vspace{2mm} |
413 | \item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\ |
414 | \textbf{\underline{Simplified} example:} synchronous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\ |
415 | \tiny |
416 | \vspace{2mm} |
417 | \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl), ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft, 0,}\\ |
418 | \quad \texttt{\hspace{9.5em}ar(nzb,\textcolor{blue}{nysg},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\ |
419 | \quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\ |
420 | \vspace{2mm} |
421 | \item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\ |
422 | \tiny |
423 | \vspace{2mm} |
424 | \quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{nbgp} )}\\ |
425 | \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{nbgp}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\ |
426 | \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\ |
427 | \ \\ |
428 | \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,\textcolor{blue}{nysg},nxl), 1, type\underline{\ }yz(grid\underline{\ }level), pleft, 0, ...}\\ |
429 | \end{itemize} |
430 | \end{frame} |
431 | |
432 | % Folie 11 |
433 | \begin{frame} |
434 | \frametitle{Transpositions} |
435 | \footnotesize |
436 | \begin{columns}[T] |
437 | \begin{column}{1.05\textwidth} |
438 | \begin{itemize} |
439 | \item<1-> Transpositions can be found in file \texttt{transpose.f90} (several subroutines for 1D- or 2D-decompositions; they are called mainly from the FFT pressure solver, see \texttt{poisfft.f90}).\\ |
440 | \ \\ |
441 | \item<2-> The following example is for a transposition from $x$ to $y$, i.e. for the input array all data elements along $x$ reside on the same PE, while after the transposition, all elements along $y$ are on the same PE:\\ |
442 | \ \\ |
443 | \tiny |
444 | \texttt{!}\\ |
445 | \texttt{!-- in SUBROUTINE transpose\underline{\ }xy:}\\ |
446 | \texttt{CALL MPI\underline{\ }ALLTOALL( f\underline{\ }inv(nys\underline{\ }x,nzb\underline{\ }x,0), \hspace{1em}sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\ |
447 | \texttt{\hspace{9.5em}work(1,nzb\underline{\ }y, nxl\underline{\ }y,0), sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\ |
448 | \texttt{\hspace{9.5em}comm1dy, ierr )}\\ |
449 | \ \\ |
450 | \item<3-> The data resorting before and after the calls of MPI\_ALLTOALL is highly optimized to account for the different processor architectures and even allows for overlapping communication and calculation. |
451 | \end{itemize} |
452 | \end{column} |
453 | \end{columns} |
454 | \end{frame} |
455 | |
456 | % Folie 12 |
457 | \begin{frame} |
458 | \frametitle{Parallel I/O} |
459 | \scriptsize |
460 | \vspace{-2mm} |
461 | \begin{columns}[T] |
462 | \begin{column}{1.1\textwidth} |
463 | \begin{itemize} |
464 | \item<1-> PALM writes and reads some of the input/output files in parallel, i.e. each processor writes/reads its own file. \textbf{Each file then has a different name!}\\ |
465 | \ \\ |
466 | \textbf{Example:} binary files for restart are written into a subdirectory of the PALM working directory:\\ |
467 | \quad \texttt{BINOUT/\_0000}\\ |
468 | \quad \texttt{BINOUT/\_0001}\\ |
469 | \quad $\vdots$ |
470 | \item<2-> These files can be handled (copied) by \texttt{mrun} using the file attribute \texttt{pe} in the configuration file:\\ |
471 | \texttt{BINOUT out:loc:pe restart \~{}/palm/current\underline{\ }version/JOBS/\$fname/RESTART \underline{\ }d3d}\\ |
472 | \ \\ |
473 | \onslide<3->In this case, filenames are interpreted as directory names. |
474 | An \texttt{mrun} call using option\\ |
475 | ``\texttt{-d example\underline{\ }cbl -r restart}'' will copy the local \textbf{\underline{directory}} \texttt{BINOUT} to the \textbf{\underline{directory}} \texttt{.../RESTART/example\underline{\ }cbl\underline{\ }d3d} . |
476 | \end{itemize} |
477 | \onslide<4-> \textbf{General comment:} |
478 | \begin{itemize} |
479 | \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-option \texttt{-w}) |
480 | \end{itemize} |
481 | \end{column} |
482 | \end{columns} |
483 | \end{frame} |
484 | |
485 | |
486 | |
487 | %Folie 13 |
488 | \begin{frame} |
489 | \frametitle{PALM Parallel I/O for 2D/3D Data} |
490 | \footnotesize |
491 | \begin{itemize} |
492 | \item<1-> 2D- and 3D-data output is also written in parallel by the processors (2D: by default, 3D: generally). |
493 | \item<2-> Because the graphics software (\texttt{ncview}, \texttt{ncl}, \texttt{ferret}, etc.) expects the data to be in one file, these output files have to be merged to one single file after PALM has finished.\\ |
494 | \ \\ |
495 | This is done within the job by calling the utility program \texttt{combine\underline{\ }plot\underline{\ }fields.x} after PALM has successfully finished. |
496 | \item<3-> \texttt{combine\underline{\ }plot\underline{\ }fields.x} is automatically executed by \texttt{mrun}. |
497 | \item<4-> The executable \texttt{combine\underline{\ }plot\underline{\ }fields.x} is created during the installation process by the command\\ |
498 | \ \\ |
499 | \quad \texttt{mbuild -u -h <host identifier>} |
500 | \end{itemize} |
501 | \end{frame} |
502 | |
503 | %Folie 14 |
504 | \begin{frame} |
505 | \frametitle{PALM Parallel I/O for 2D/3D Data with netCDF4/HDF5} |
506 | \footnotesize |
507 | \begin{itemize} |
508 | \item<1-> The Cray XC30 of HLRN-III allows direct parallel I/O to a netCDF file |
509 | \vspace{2mm} |
510 | \item<2-> modules \texttt{cray\_hdf5\_parallel} and \texttt{cray\_netcdf\_hdf5parallel} have to be loaded |
511 | \vspace{2mm} |
512 | \item<3-> cpp-switches \texttt{-D\_\_netcdf}, \texttt{-D\_\_netcdf4}, \texttt{-D\_\_netcdf4\_parallel} have to be set |
513 | \vspace{2mm} |
514 | \item<4-> Both are done in the default HLRN-III block of the configuration file (\texttt{lccrayh}) |
515 | \vspace{2mm} |
516 | \item<5-> \texttt{d3par}-parameter \texttt{netcdf\_data\_format=5} has to be set in the parameter file |
517 | \vspace{2mm} |
518 | \item<6-> \texttt{combine\_plot\_fields.x} is not required in this case |
519 | \end{itemize} |
520 | \end{frame} |
521 | |
522 | %Folie 15 |
523 | \begin{frame} |
524 | \frametitle{Performance Examples (I)} |
525 | \begin{itemize} |
526 | \item Simulation using 1536 * 768 * 242 grid points ($\sim$ 60 GByte) |
527 | \end{itemize} |
528 | \includegraphics[scale=0.28]{parallelization_figures/perf_left.png} |
529 | \includegraphics[scale=0.28]{parallelization_figures/perf_right.png} |
530 | \includegraphics[scale=0.28]{parallelization_figures/legende.png} \\ |
531 | \scriptsize |
532 | \begin{columns}[T] |
533 | \begin{column}{0.18\textwidth} |
534 | \end{column} |
535 | \begin{column}{0.4\textwidth} |
536 | IBM-Regatta, HLRN, Hannover\\ |
537 | (1D domain decomposition) |
538 | \end{column} |
539 | \begin{column}{0.4\textwidth} |
540 | Sun Fire X4600, Tokyo Institute of Technology\\ |
541 | (2D domain decomposition) |
542 | \end{column} |
543 | \begin{column}{0.2\textwidth} |
544 | \end{column} |
545 | |
546 | \end{columns} |
547 | |
548 | \end{frame} |
549 | |
550 | %Folie 16 |
551 | \begin{frame} |
552 | \frametitle{Performance Examples (II)} |
553 | \begin{itemize} |
554 | \item Simulation with $2048^3$ grid points ($\sim$ 2 TByte memory) |
555 | \end{itemize} |
556 | \begin{columns}[T] |
557 | \begin{column}{0.5\textwidth} |
558 | \includegraphics[scale=0.25]{parallelization_figures/perf_3.png} \\ |
559 | \scriptsize |
560 | \quad SGI-ICE2, HLRN-II, Hannover\\ |
561 | \quad (2D-domain decomposition) |
562 | \end{column} |
563 | \begin{column}{0.5\textwidth} |
564 | \vspace{35mm} |
565 | \onslide<2-> largest simulation feasible on that system:\\ |
566 | \ \\ |
567 | $4096^3$ grid points |
568 | \end{column} |
569 | \end{columns} |
570 | \end{frame} |
571 | |
572 | %Folie 17 |
573 | \begin{frame} |
574 | \frametitle{Performance Examples (III)} |
575 | \begin{itemize} |
576 | \item Simulation with $4320^3$ grid points ($\sim$ 13 TByte memory) |
577 | \end{itemize} |
578 | \begin{columns}[T] |
579 | \begin{column}{0.5\textwidth} |
580 | \includegraphics[scale=0.5]{parallelization_figures/perf_4.png} \\ |
581 | \scriptsize |
582 | \quad Cray-XC40, HLRN-III, Hannover\\ |
583 | \quad (2D-domain decomposition) |
584 | \end{column} |
585 | \begin{column}{0.5\textwidth} |
586 | \vspace{35mm} |
587 | \onslide<2-> currently largest simulation feasible on that system:\\ |
588 | \ \\ |
589 | $5600^3$ grid points |
590 | \end{column} |
591 | \end{columns} |
592 | \end{frame} |
593 | |
