[945] | 1 | % $Id: parallelization.tex 973 2012-08-07 16:03:47Z heinze $ |
---|
| 2 | \input{header_tmp.tex} |
---|
| 3 | %\input{../header_lectures.tex} |
---|
| 4 | |
---|
| 5 | \usepackage[utf8]{inputenc} |
---|
| 6 | \usepackage{ngerman} |
---|
| 7 | \usepackage{pgf} |
---|
| 8 | \usetheme{Dresden} |
---|
| 9 | \usepackage{subfigure} |
---|
| 10 | \usepackage{units} |
---|
| 11 | \usepackage{multimedia} |
---|
| 12 | \usepackage{hyperref} |
---|
| 13 | \newcommand{\event}[1]{\newcommand{\eventname}{#1}} |
---|
| 14 | \usepackage{xmpmulti} |
---|
| 15 | \usepackage{tikz} |
---|
| 16 | \usetikzlibrary{shapes,arrows,positioning} |
---|
| 17 | \usetikzlibrary{decorations.markings} %neues paket |
---|
| 18 | \usetikzlibrary{decorations.pathreplacing} %neues paket |
---|
| 19 | \def\Tiny{\fontsize{4pt}{4pt}\selectfont} |
---|
| 20 | \usepackage{amsmath} |
---|
| 21 | \usepackage{amssymb} |
---|
| 22 | \usepackage{multicol} |
---|
| 23 | \usepackage{pdfcomment} |
---|
| 24 | \usepackage{graphicx} |
---|
| 25 | \usepackage{listings} |
---|
| 26 | \lstset{showspaces=false,language=fortran,basicstyle= |
---|
| 27 | \ttfamily,showstringspaces=false,captionpos=b} |
---|
| 28 | |
---|
| 29 | \institute{Institut für Meteorologie und Klimatologie, Leibniz Universität Hannover} |
---|
| 30 | \date{last update: \today} |
---|
| 31 | \event{PALM Seminar} |
---|
| 32 | \setbeamertemplate{navigation symbols}{} |
---|
| 33 | |
---|
| 34 | \setbeamertemplate{footline} |
---|
| 35 | { |
---|
| 36 | \begin{beamercolorbox}[rightskip=-0.1cm]& |
---|
| 37 | {\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}} |
---|
| 38 | \end{beamercolorbox} |
---|
| 39 | \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex, |
---|
| 40 | leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot} |
---|
| 41 | {\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber} |
---|
| 42 | \end{beamercolorbox} |
---|
| 43 | \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot} |
---|
| 44 | \end{beamercolorbox} |
---|
| 45 | } |
---|
| 46 | %\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}} |
---|
| 47 | |
---|
| 48 | \title[Parallelization]{Parallelization} |
---|
| 49 | \author{Siegfried Raasch} |
---|
| 50 | |
---|
| 51 | \begin{document} |
---|
| 52 | |
---|
| 53 | % Folie 1 |
---|
| 54 | \begin{frame} |
---|
| 55 | \titlepage |
---|
| 56 | \end{frame} |
---|
| 57 | |
---|
| 58 | \section{Parallelization} |
---|
| 59 | \subsection{Parallelization} |
---|
| 60 | |
---|
| 61 | % Folie 2 |
---|
| 62 | \begin{frame} |
---|
| 63 | \frametitle{Basics of Parallelization} |
---|
| 64 | \tikzstyle{yellow} = [rectangle, fill=yellow!20, text width=0.6\textwidth, font=\scriptsize] |
---|
| 65 | \scriptsize |
---|
| 66 | \textbf{Parallelization:} |
---|
| 67 | \begin{itemize} |
---|
| 68 | \item<2-> All processor elements (PE, core) are carrying out the same program (SIMD). |
---|
| 69 | \item<3-> Each PE of a parallel computer operates on a different set of data. |
---|
| 70 | \end{itemize} |
---|
| 71 | |
---|
| 72 | \ \\ |
---|
| 73 | \onslide<4->\textbf{Realization:} |
---|
| 74 | \begin{columns}[T] |
---|
| 75 | \begin{column}{0.45\textwidth} |
---|
| 76 | \onslide<5->each PE solves the equations for a different subdomain of the total domain |
---|
| 77 | \begin{center} |
---|
| 78 | \includegraphics[width=0.5\textwidth]{parallelization_figures/subdomain.png} |
---|
| 79 | \end{center} |
---|
| 80 | \onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\ |
---|
| 81 | \onslide<9->\textbf{message passing model (MPI)} |
---|
| 82 | \end{column} |
---|
| 83 | \begin{column}{0.45\textwidth} |
---|
| 84 | \onslide<6->program loops are parallelized, i.e. each processor solves for a subset of the total index range |
---|
| 85 | \begin{center} |
---|
| 86 | \begin{tikzpicture}[auto, node distance=0] |
---|
| 87 | \node [yellow] (1) {% |
---|
| 88 | \texttt{!\$OMP DO}\\ |
---|
| 89 | \texttt{\quad \quad DO i = 1, 100}\\ |
---|
| 90 | \quad \quad \quad $\vdots$\\ |
---|
| 91 | \texttt{\quad \quad ENDDO}}; |
---|
| 92 | \end{tikzpicture} |
---|
| 93 | \end{center} |
---|
| 94 | \onslide<8-> parallelization can easily be done by the compiler, if all PEs have access to all variables (shared memory)\\ |
---|
| 95 | \onslide<10-> \textbf{shared memory model (OpenMP)} |
---|
| 96 | \end{column} |
---|
| 97 | \end{columns} |
---|
| 98 | \end{frame} |
---|
| 99 | |
---|
| 100 | % Folie 3 |
---|
| 101 | \begin{frame} |
---|
| 102 | \frametitle{Basic Architectures of Massively Parallel Computers} |
---|
| 103 | \tikzstyle{info} = [rectangle, text width=0.25\textwidth, font=\scriptsize] |
---|
| 104 | |
---|
| 105 | |
---|
| 106 | \begin{center} |
---|
| 107 | \begin{tikzpicture} |
---|
| 108 | |
---|
| 109 | \node (center) at (0,1) {}; |
---|
[973] | 110 | \onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network}; |
---|
[945] | 111 | \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-T3E)}; |
---|
| 112 | \onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory}; |
---|
| 113 | \node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)}; |
---|
| 114 | \onslide<7-> \node (MPI) at (-3.5,-3) [ellipse,fill=yellow!90] {MPI}; |
---|
| 115 | \onslide<8-> \node (OpenMP) at (3.5,-3) [ellipse,fill=yellow!90] {OpenMP}; |
---|
| 116 | \onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems}; |
---|
| 117 | \node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster, |
---|
[973] | 118 | NEC-SX, SGI-ICE, Cray-XE6)}; |
---|
[945] | 119 | |
---|
| 120 | % Addressable memory node (big) |
---|
| 121 | |
---|
| 122 | \onslide<3-> \node (p1) at (2,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 123 | \node (p2) at (2.6,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 124 | \node (p3) at (3.2,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 125 | \node (p4) at (3.8,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 126 | \node (p5) at (4.4,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 127 | \node (p6) at (5,-0.05) [draw,circle, scale=0.9] {\scriptsize p}; |
---|
| 128 | |
---|
| 129 | \draw[-] (3.5,0.7) -- (3.5,0.4); |
---|
| 130 | \draw[-] (2,0.4) -- (5,0.4); |
---|
| 131 | \draw[-] (2,0.4) -- (p1); |
---|
| 132 | \draw[-] (2.6,0.4) -- (p2); |
---|
| 133 | \draw[-] (3.2,0.4) -- (p3); |
---|
| 134 | \draw[-] (3.8,0.4) -- (p4); |
---|
| 135 | \draw[-] (4.4,0.4) -- (p5); |
---|
| 136 | \draw[-] (5,0.4) -- (p6); |
---|
| 137 | |
---|
| 138 | % Addressable memory node (small) |
---|
| 139 | \onslide<4-> |
---|
[973] | 140 | |
---|
| 141 | \node (small_node) at (-2,0.6) [scale=0.2] {% |
---|
| 142 | \begin{tikzpicture} |
---|
| 143 | |
---|
| 144 | \node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {}; |
---|
| 145 | |
---|
| 146 | \node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
| 147 | \node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
| 148 | \node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
| 149 | \node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
| 150 | \node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
| 151 | \node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {}; |
---|
[945] | 152 | |
---|
[973] | 153 | \draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4); |
---|
| 154 | \draw[-, ultra thick] (2,0.4) -- (5,0.4); |
---|
| 155 | \draw[-, ultra thick] (2,0.4) -- (p1_small); |
---|
| 156 | \draw[-, ultra thick] (2.6,0.4) -- (p2_small); |
---|
| 157 | \draw[-, ultra thick] (3.2,0.4) -- (p3_small); |
---|
| 158 | \draw[-, ultra thick] (3.8,0.4) -- (p4_small); |
---|
| 159 | \draw[-, ultra thick] (4.4,0.4) -- (p5_small); |
---|
| 160 | \draw[-, ultra thick] (5,0.4) -- (p6_small); |
---|
| 161 | |
---|
| 162 | |
---|
| 163 | \end{tikzpicture} |
---|
| 164 | } ; |
---|
[945] | 165 | |
---|
[973] | 166 | \draw[->, thick] (1.5,0.2) -- (small_node) ; |
---|
| 167 | \draw[-] (-2.7,0.75) -- (-2.3,0.725); |
---|
[945] | 168 | \onslide<5-> |
---|
[973] | 169 | \node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node}; |
---|
[945] | 170 | |
---|
| 171 | % Black Arrows |
---|
| 172 | \onslide<6-> \draw[->, thick] (-2.5,-1.5) -- (-0.8,-2.2) ; |
---|
| 173 | \draw[->, thick] (2.5,-1.5) -- (0.8,-2.2) ; |
---|
| 174 | |
---|
| 175 | % MPI Arrows |
---|
| 176 | \onslide<7-> \draw[->, ultra thick, color=yellow] (-3.5,-2.7) -- (-3.5,-1.5) ; |
---|
| 177 | \draw[->, ultra thick, color=yellow] (-2.9,-3) -- (-1.0,-3.0) ; |
---|
| 178 | \draw[->, ultra thick, color=yellow] (-3.0,-2.8) -- (1.5,-1.0) ; |
---|
| 179 | |
---|
| 180 | % OpenMP Arrows |
---|
| 181 | \onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ; |
---|
[973] | 182 | \draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ; |
---|
[945] | 183 | |
---|
| 184 | % Network decorations |
---|
| 185 | \onslide<2-> \node (pr1) at (-4.6,0.7) [draw,circle,scale=0.5] {}; |
---|
| 186 | \node (mem1) at (-4.6,0.45) [draw,rectangle,scale=0.5] {}; |
---|
| 187 | \draw[-] (-4.5,0.9) -- (pr1); |
---|
| 188 | \draw[-] (mem1) -- (pr1); |
---|
| 189 | \draw[-] ([xshift=0.02cm]pr1.east) -- ([xshift=0.3cm, yshift=-0.2cm]pr1.east); |
---|
| 190 | \draw[-] ([xshift=0.02cm]mem1.east) -- ([xshift=0.3cm, yshift=-0.2cm]mem1.east); |
---|
| 191 | \node at (-3.7,0.45) {\tiny processor}; |
---|
| 192 | \node at (-3.3,0.25) {\tiny addressable memory}; |
---|
| 193 | |
---|
| 194 | \node (pr2) at (-4.6,1.3) [draw,circle,scale=0.5] {}; |
---|
| 195 | \node (mem2) at (-4.6,1.55) [draw,rectangle,scale=0.5] {}; |
---|
| 196 | \draw[-] (-4.5,1.1) -- (pr2); |
---|
| 197 | \draw[-] (mem2) -- (pr2); |
---|
| 198 | \node (pr3) at (-3.9,1.5) [draw,circle,scale=0.5] {}; |
---|
| 199 | \node (mem3) at (-3.9,1.75) [draw,rectangle,scale=0.5] {}; |
---|
| 200 | \draw[-] (-3.8,1.3) -- (pr3); |
---|
| 201 | \draw[-] (mem3) -- (pr3); |
---|
| 202 | \node (pr4) at (-4.9,0.95) [draw,circle,scale=0.5] {}; |
---|
| 203 | \node (mem4) at (-4.9,0.7) [draw,rectangle,scale=0.5] {}; |
---|
| 204 | \draw[-] (-4.55,1.0) -- (pr4); |
---|
| 205 | \draw[-] (mem4) -- (pr4); |
---|
| 206 | \node (pr5) at (-3.0,1.5) [draw,circle,scale=0.5] {}; |
---|
| 207 | \node (mem5) at (-3.0,1.75) [draw,rectangle,scale=0.5] {}; |
---|
| 208 | \draw[-] (-3.08,1.3) -- (pr5); |
---|
| 209 | \draw[-] (mem5) -- (pr5); |
---|
| 210 | \node (pr6) at (-2.2,1.1) [draw,circle,scale=0.5] {}; |
---|
| 211 | \node (mem6) at (-2.2,1.35) [draw,rectangle,scale=0.5] {}; |
---|
| 212 | \draw[-] (-2.45,1.0) -- (pr6); |
---|
| 213 | \draw[-] (mem6) -- (pr6); |
---|
[973] | 214 | |
---|
| 215 | \onslide<1-> |
---|
[945] | 216 | \end{tikzpicture} |
---|
| 217 | \end{center} |
---|
| 218 | |
---|
| 219 | \end{frame} |
---|
| 220 | |
---|
| 221 | % Folie 4 |
---|
| 222 | \begin{frame} |
---|
| 223 | \frametitle{PALM Parallelization Model} |
---|
| 224 | \scriptsize |
---|
| 225 | \onslide<2-> \textbf{General demands for a parallelized program:} |
---|
| 226 | \begin{itemize} |
---|
| 227 | \item<3-> Load balancing |
---|
| 228 | \item<4-> Small communication overhead |
---|
| 229 | \item<5-> Scalability (up to large numbers of processors) |
---|
| 230 | \end{itemize} |
---|
| 231 | \vspace{2mm} |
---|
| 232 | \onslide<6-> \textbf{The basic parallelization method used for PALM is a 2D-domain decomposition along $x$ and $y$:}\\ |
---|
| 233 | \begin{columns}[T] |
---|
| 234 | \begin{column}{0.3\textwidth} |
---|
| 235 | \onslide<7-> \includegraphics[width=1.0\textwidth]{parallelization_figures/subdomain.png} |
---|
| 236 | \end{column} |
---|
| 237 | \begin{column}{0.6\textwidth} |
---|
| 238 | \ \\ |
---|
| 239 | \onslide<8-> contiguous data in memory (FORTRAN):\\ |
---|
| 240 | \ \\ |
---|
| 241 | \ \\ |
---|
| 242 | \textcolor{blue}{columns of i}\\ |
---|
| 243 | \textcolor{red}{no contiguous data at all}\\ |
---|
| 244 | \onslide<9-> \textcolor{blue}{columns of k}\\ |
---|
| 245 | \textcolor{red}{planes of k,j (all data contiguous)} |
---|
| 246 | \end{column} |
---|
| 247 | \end{columns} |
---|
| 248 | \vspace{2mm} |
---|
| 249 | \begin{itemize} |
---|
| 250 | \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used in case of slow networks, but this generally doesn't scale for processor numbers $>$ 256. |
---|
[973] | 251 | \vspace{2mm} |
---|
[945] | 252 | \item<11-> Message passing is realized using MPI. |
---|
[973] | 253 | \vspace{2mm} |
---|
[945] | 254 | \item<12-> OpenMP parallelization as well as mixed usage of OpenMP and |
---|
| 255 | MPI is also possible. (OpenMP tests and optimization are under way.) |
---|
| 256 | \end{itemize} |
---|
| 257 | \end{frame} |
---|
| 258 | |
---|
| 259 | % Folie 5 |
---|
| 260 | \begin{frame} |
---|
| 261 | \frametitle{Implications of Decomposition} |
---|
| 262 | \scriptsize |
---|
| 263 | \begin{columns}[T] |
---|
| 264 | \begin{column}{0.5\textwidth} |
---|
| 265 | \begin{itemize} |
---|
| 266 | \item<2-> Central finite differences cause \textcolor{red}{local data dependencies}\\ |
---|
| 267 | \ \\ |
---|
| 268 | solution: introduction of \textcolor{red}{ghost points} |
---|
| 269 | \vspace{5mm} |
---|
| 270 | |
---|
| 271 | \begin{flushright} |
---|
| 272 | \onslide<3-> $\left. \dfrac{\partial \psi}{\partial x} \right|_i = \dfrac{\psi_{i+1} - \psi_{i-1}}{2 \Delta x}$ |
---|
| 273 | \end{flushright} |
---|
| 274 | \vspace{10mm} |
---|
| 275 | \item<4-> FFT and linear equation solver cause \textcolor{red}{non-local data dependencies}\\ |
---|
| 276 | \ \\ |
---|
| 277 | solution: transposition of 3D-arrays |
---|
| 278 | \end{itemize} |
---|
| 279 | \end{column} |
---|
| 280 | \begin{column}{0.6\textwidth} |
---|
| 281 | \begin{center} |
---|
| 282 | \vspace{-5mm} |
---|
| 283 | \onslide<3-> \includegraphics[width=0.7\textwidth]{parallelization_figures/ghost_points.png} |
---|
| 284 | \vspace{4mm} |
---|
| 285 | \onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center} |
---|
| 286 | \vspace{-4mm} |
---|
[973] | 287 | \textbf{Example: transpositions for solving the Poisson\\ \hspace{4em}equation} |
---|
[945] | 288 | \end{column} |
---|
| 289 | \end{columns} |
---|
| 290 | \end{frame} |
---|
| 291 | |
---|
| 292 | % Folie 6 |
---|
| 293 | \begin{frame} |
---|
| 294 | \frametitle{How to Use the Parallelized Version of PALM} |
---|
| 295 | \scriptsize |
---|
| 296 | \begin{columns}[T] |
---|
| 297 | \begin{column}{1.12\textwidth} |
---|
| 298 | \begin{itemize} |
---|
| 299 | \item<1-> The parallel version of PALM is switched on by \texttt{mrun}-option ''\texttt{-K parallel}''. Additionally, the number of required processors and the number of tasks per node (number of PEs to be used on one node) have to be provided:\\ |
---|
| 300 | \quad \texttt{mrun ... -K parallel -X64 -T8 ...} |
---|
| 301 | \item<2-> From an accounting point of view, it is always most efficient to use all PEs of a node (\texttt{-T8}) (in case of a ''non-shared'' usage of nodes). |
---|
| 302 | \item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1--2 PEs fewer than the total number of PEs on the node are used. |
---|
| 303 | \item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g. \texttt{-h necriam}). The virtual processor grid to be used can be set manually by d3par-parameters \texttt{npex} and \texttt{npey}. |
---|
| 304 | \item<6-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?).\\ |
---|
| 305 | \end{itemize} |
---|
| 306 | \begin{center} |
---|
| 307 | \vspace{-7mm} |
---|
| 308 | \onslide<5-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png} |
---|
| 309 | \end{center} |
---|
| 310 | \end{column} |
---|
| 311 | \end{columns} |
---|
| 312 | \end{frame} |
---|
| 313 | |
---|
| 314 | % Folie 7 |
---|
| 315 | \begin{frame} |
---|
| 316 | \frametitle{MPI Communication} |
---|
| 317 | \scriptsize |
---|
| 318 | \begin{columns}[T] |
---|
| 319 | \begin{column}{1.12\textwidth} |
---|
| 320 | \begin{itemize} |
---|
| 321 | \item<1-> MPI (message passing interface) is a portable interface for communication between PEs (FORTRAN or C library). |
---|
| 322 | \vspace{2mm} |
---|
| 323 | \item<2-> To make MPI available on HLRN's SGI-ICE, the module \texttt{mpt} must be loaded by setting the \texttt{\%modules} option in .mrun.config appropriately: |
---|
| 324 | |
---|
| 325 | \quad \texttt{\%modules ...:mpt:...} |
---|
| 326 | \vspace{2mm} |
---|
| 327 | \item<2-> The path to the MPI-library may have to be given in the compiler call, by setting an appropriate option in the configuration file .mrun.config: |
---|
| 328 | |
---|
| 329 | \quad \texttt{\%lopts -axW:-cpp:-r8:-nbs:-Vaxlib:\textcolor{blue}{-L:<replace by mpi library path>:-lmpi}} |
---|
| 330 | \vspace{2mm} |
---|
| 331 | \item<3-> All MPI calls must be within\\ |
---|
| 332 | \quad \texttt{CALL MPI\_INIT( ierror )}\\ |
---|
| 333 | \quad $\vdots$\\ |
---|
| 334 | \quad \texttt{CALL MPI\_FINALIZE( ierror )}\\ |
---|
| 335 | \end{itemize} |
---|
| 336 | |
---|
| 337 | \end{column} |
---|
| 338 | \end{columns} |
---|
| 339 | \end{frame} |
---|
| 340 | |
---|
| 341 | % Folie 8 |
---|
| 342 | \begin{frame} |
---|
| 343 | \frametitle{Communication Within PALM} |
---|
| 344 | \small |
---|
| 345 | \begin{itemize} |
---|
| 346 | \item<1-> MPI calls within PALM are available when using the \texttt{mrun}-option ''\texttt{-K parallel}''. |
---|
| 347 | \item<2-> Communication is needed for |
---|
| 348 | \begin{itemize} |
---|
| 349 | \footnotesize |
---|
| 350 | \item<2-> exchange of ghost points |
---|
| 351 | \item<3-> transpositions (FFT-Poisson-solver) |
---|
| 352 | \item<4-> calculating global sums (e.g. for calculating horizontal averages) |
---|
| 353 | \end{itemize} |
---|
| 354 | \item<5-> Additional MPI calls are required to define the so-called virtual processor grid and to define special data types needed for more comfortable exchange of data. |
---|
| 355 | \end{itemize} |
---|
| 356 | \end{frame} |
---|
| 357 | |
---|
| 358 | % Folie 9 |
---|
| 359 | \begin{frame} |
---|
| 360 | \frametitle{Virtual Processor Grid Used in PALM} |
---|
[973] | 361 | \scriptsize |
---|
| 362 | \vspace{2mm} |
---|
| 363 | The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\ |
---|
| 364 | \ \\ |
---|
[945] | 365 | \begin{itemize} |
---|
| 366 | \item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so-called communicator (here: \texttt{comm2d}):\\ |
---|
[973] | 367 | \tiny |
---|
| 368 | \vspace{1.5mm} |
---|
[945] | 369 | \quad \texttt{ndim = 2}\\ |
---|
[973] | 370 | \quad \texttt{pdims(1) = npex \quad ! \# of processors along x}\\ |
---|
| 371 | \quad \texttt{pdims(2) = npey \quad ! \# of processors along y}\\ |
---|
[945] | 372 | \quad \texttt{cyclic(1) = .TRUE.}\\ |
---|
| 373 | \quad \texttt{cyclic(2) = .TRUE.}\\ |
---|
[973] | 374 | \ \\ |
---|
| 375 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\ |
---|
| 376 | \quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )} |
---|
| 377 | \scriptsize |
---|
| 378 | \vspace{4mm} |
---|
[945] | 379 | \item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\ |
---|
[973] | 380 | \tiny |
---|
| 381 | \vspace{1.5mm} |
---|
| 382 | \quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )} |
---|
| 383 | \scriptsize |
---|
| 384 | \vspace{4mm} |
---|
[945] | 385 | \item<4-> The ids of the neighbouring PEs are determined by:\\ |
---|
[973] | 386 | \tiny |
---|
| 387 | \vspace{1.5mm} |
---|
[945] | 388 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\ |
---|
[973] | 389 | \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\ |
---|
[945] | 390 | \end{itemize} |
---|
| 391 | \end{frame} |
---|
| 392 | |
---|
| 393 | % Folie 10 |
---|
| 394 | \begin{frame} |
---|
| 395 | \frametitle{Exchange of ghost points} |
---|
| 396 | \scriptsize |
---|
| 397 | \begin{itemize} |
---|
| 398 | \item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\ |
---|
| 399 | \tiny |
---|
[973] | 400 | \vspace{2mm} |
---|
[945] | 401 | \quad \texttt{u(:,:,nxl\textcolor{blue}{-ngl}), u(:,:,nxr\textcolor{blue}{+ngl}) ! left and right boundary}\\ |
---|
| 402 | \quad \texttt{u(:,nys\textcolor{blue}{-ngl},:), u(:,nyn\textcolor{blue}{+ngl},:) ! south and north boundary}\\ |
---|
[973] | 403 | \vspace{4mm} |
---|
[945] | 404 | \item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\ |
---|
| 405 | \textbf{\underline{Simplified} example:} synchronous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\ |
---|
| 406 | \tiny |
---|
[973] | 407 | \vspace{2mm} |
---|
[945] | 408 | \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-\textcolor{blue}{ngl},nxl), ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft, 0,}\\ |
---|
| 409 | \quad \texttt{\hspace{9.5em}ar(nzb,nys-\textcolor{blue}{ngl},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\ |
---|
| 410 | \quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\ |
---|
[973] | 411 | \vspace{4mm} |
---|
[945] | 412 | \item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\ |
---|
| 413 | \tiny |
---|
[973] | 414 | \vspace{2mm} |
---|
[945] | 415 | \quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{ngl} )}\\ |
---|
| 416 | \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{ngl}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\ |
---|
[973] | 417 | \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\ |
---|
[945] | 418 | \ \\ |
---|
| 419 | \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-ngl,nxl), type\underline{\ }yz(grid\underline{\ }level), MPI\underline{\ }REAL, pleft, 0, ...}\\ |
---|
| 420 | \end{itemize} |
---|
| 421 | \end{frame} |
---|
| 422 | |
---|
| 423 | % Folie 11 |
---|
| 424 | \begin{frame} |
---|
| 425 | \frametitle{Transpositions} |
---|
| 426 | \footnotesize |
---|
| 427 | \begin{columns}[T] |
---|
| 428 | \begin{column}{1.05\textwidth} |
---|
| 429 | \begin{itemize} |
---|
| 430 | \item<1-> Transpositions can be found in file \texttt{transpose.f90} (several subroutines for 1D- or 2D-decompositions; they are called mainly from the FFT pressure solver, see \texttt{poisfft.f90}).\\ |
---|
| 431 | \ \\ |
---|
| 432 | \item<2-> The following example is for a transposition from $x$ to $y$, i.e. for the input array all data elements along $x$ reside on the same PE, while after the transposition, all elements along $y$ are on the same PE:\\ |
---|
| 433 | \ \\ |
---|
| 434 | \scriptsize |
---|
| 435 | \texttt{!}\\ |
---|
| 436 | \texttt{!-- in SUBROUTINE transpose\underline{\ }xy:}\\ |
---|
| 437 | \texttt{CALL MPI\underline{\ }ALLTOALL( f\underline{\ }inv(nys\underline{\ }x,nzb\underline{\ }x,0), sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\ |
---|
| 438 | \texttt{\hspace{9.5em}work(1), \hspace{6.5em}sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\ |
---|
| 439 | \texttt{\hspace{9.5em}comm1dy, ierr )}\\ |
---|
| 440 | \ \\ |
---|
| 441 | \item<3-> The data resorting before and after the calls of MPI\_ALLTOALL is highly optimized to account for the different processor architectures. |
---|
| 442 | \end{itemize} |
---|
| 443 | \end{column} |
---|
| 444 | \end{columns} |
---|
| 445 | \end{frame} |
---|
| 446 | |
---|
| 447 | % Folie 12 |
---|
| 448 | \begin{frame} |
---|
| 449 | \frametitle{Parallel I/O} |
---|
| 450 | \scriptsize |
---|
[973] | 451 | \vspace{-2mm} |
---|
[945] | 452 | \begin{columns}[T] |
---|
| 453 | \begin{column}{1.1\textwidth} |
---|
| 454 | \begin{itemize} |
---|
| 455 | \item<1-> PALM writes and reads some of the input/output files in parallel, i.e. each processor writes/reads its own file. \textbf{Each file then has a different name!}\\ |
---|
| 456 | \ \\ |
---|
| 457 | \textbf{Example:} binary files for restart are written into a subdirectory of the PALM working directory:\\ |
---|
| 458 | \quad \texttt{BINOUT/\_0000}\\ |
---|
| 459 | \quad \texttt{BINOUT/\_0001}\\ |
---|
| 460 | \quad $\vdots$ |
---|
| 461 | \item<2-> These files can be handled (copied) by \texttt{mrun} using the file attribute \texttt{pe} in the configuration file:\\ |
---|
| 462 | \texttt{BINOUT out:loc:pe restart \~{}/palm/current\underline{\ }version/JOBS/\$fname/RESTART \underline{\ }d3d}\\ |
---|
| 463 | \ \\ |
---|
| 464 | \onslide<3->In this case, filenames are interpreted as directory names. |
---|
| 465 | An \texttt{mrun} call using option\\ |
---|
| 466 | ''\texttt{-d example\underline{\ }cbl -r restart}'' will copy the local \textbf{\underline{directory}} \texttt{BINOUT} to the \textbf{\underline{directory}} \texttt{.../RESTART/example\underline{\ }cbl\underline{\ }d3d} . |
---|
| 467 | \end{itemize} |
---|
| 468 | \onslide<4-> \textbf{General comment:} |
---|
| 469 | \begin{itemize} |
---|
[973] | 470 | \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-options) |
---|
[945] | 471 | \end{itemize} |
---|
| 472 | \end{column} |
---|
| 473 | \end{columns} |
---|
| 474 | \end{frame} |
---|
| 475 | |
---|
| 476 | |
---|
| 477 | |
---|
| 478 | % Folie 13 |
---|
| 479 | \begin{frame} |
---|
| 480 | \frametitle{PALM Parallel I/O for 2D/3D Data} |
---|
| 481 | \footnotesize |
---|
| 482 | \begin{itemize} |
---|
| 483 | \item<1-> 2D- and 3D-data output is also written in parallel by the processors (2D: by default, 3D: generally). |
---|
| 484 | \item<2-> Because the graphics software (\texttt{ncview}, \texttt{ncl}, \texttt{ferret}, etc.) expects the data to be in one file, these output files have to be merged into one single file after PALM has finished.\\ |
---|
| 485 | \ \\ |
---|
| 486 | This is done within the job by calling the utility program \texttt{combine\underline{\ }plot\underline{\ }fields.x} after PALM has successfully finished. |
---|
| 487 | \item<3-> \texttt{combine\underline{\ }plot\underline{\ }fields.x} is automatically executed by \texttt{mrun}. |
---|
| 488 | \item<4-> The executable \texttt{combine\underline{\ }plot\underline{\ }fields.x} is created during the installation process by the command\\ |
---|
| 489 | \ \\ |
---|
| 490 | \quad \texttt{mbuild -u -h <host identifier>} |
---|
| 491 | \end{itemize} |
---|
| 492 | \end{frame} |
---|
| 493 | |
---|
| 494 | % Folie 14 |
---|
| 495 | \begin{frame} |
---|
| 496 | \frametitle{Performance Examples (I)} |
---|
| 497 | \begin{itemize} |
---|
| 498 | \item Simulation using 1536 * 768 * 242 grid points ($\sim$ 60 GByte) |
---|
| 499 | \end{itemize} |
---|
| 500 | \includegraphics[scale=0.28]{parallelization_figures/perf_left.png} |
---|
| 501 | \includegraphics[scale=0.28]{parallelization_figures/perf_right.png} |
---|
| 502 | \includegraphics[scale=0.28]{parallelization_figures/legende.png} \\ |
---|
| 503 | \scriptsize |
---|
| 504 | \begin{columns}[T] |
---|
| 505 | \begin{column}{0.18\textwidth} |
---|
| 506 | \end{column} |
---|
| 507 | \begin{column}{0.4\textwidth} |
---|
| 508 | IBM-Regatta, HLRN, Hannover\\ |
---|
| 509 | (1D domain decomposition) |
---|
| 510 | \end{column} |
---|
| 511 | \begin{column}{0.4\textwidth} |
---|
| 512 | Sun Fire X4600, Tokyo Institute of Technology\\ |
---|
| 513 | (2D domain decomposition) |
---|
| 514 | \end{column} |
---|
| 515 | \begin{column}{0.2\textwidth} |
---|
| 516 | \end{column} |
---|
| 517 | |
---|
| 518 | \end{columns} |
---|
| 519 | |
---|
| 520 | \end{frame} |
---|
| 521 | |
---|
| 522 | % Folie 15 |
---|
| 523 | \begin{frame} |
---|
| 524 | \frametitle{Performance Examples (II)} |
---|
| 525 | \begin{itemize} |
---|
| 526 | \item Simulation with $2048^3$ grid points ($\sim$ 2 TByte memory) |
---|
| 527 | \end{itemize} |
---|
| 528 | \begin{columns}[T] |
---|
| 529 | \begin{column}{0.5\textwidth} |
---|
| 530 | \includegraphics[scale=0.25]{parallelization_figures/perf_3.png} \\ |
---|
| 531 | \scriptsize |
---|
| 532 | \quad SGI-ICE2, HLRN-II, Hannover\\ |
---|
| 533 | \quad (2D-domain decomposition) |
---|
| 534 | \end{column} |
---|
| 535 | \begin{column}{0.5\textwidth} |
---|
| 536 | \vspace{35mm} |
---|
| 537 | \onslide<2-> currently largest simulation feasible on that system:\\ |
---|
| 538 | \ \\ |
---|
| 539 | $4096^3$ grid points |
---|
| 540 | \end{column} |
---|
| 541 | \end{columns} |
---|
| 542 | \end{frame} |
---|
| 543 | |
---|
| 544 | \end{document} |
---|