Changeset 973 for palm/trunk/TUTORIAL/SOURCE/parallelization.tex
- Timestamp: Aug 7, 2012 4:03:47 PM
- File: palm/trunk/TUTORIAL/SOURCE/parallelization.tex (1 edited)
palm/trunk/TUTORIAL/SOURCE/parallelization.tex
--- palm/trunk/TUTORIAL/SOURCE/parallelization.tex (r945)
+++ palm/trunk/TUTORIAL/SOURCE/parallelization.tex (r973)

@@ -108 +108 @@
  
  \node (center) at (0,1) {};
- \onslide<2-> \node (Network) at (-3.5,1) [ellipse,fill=green!20] {Network};
+ \onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network};
  \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-T3E)};
  \onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {adressable memory};

@@ -116 +116 @@
  \onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems};
  \node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster,
- NEC-SX, SGI-ICE, Cray-XT4)};
+ NEC-SX, SGI-ICE, Cray-XE6)};
  
  % Adressable memory node (big)

@@ -138 +138 @@
  % Adressable memory node (small)
  \onslide<4->
- \draw[->, thick] (1.5,0.2) -- (0.4,0.2) ;
- \node at (0,0.2) [scale=0.2] {%
+
+ \node (small_node) at (-2,0.6) [scale=0.2] {%
  \begin{tikzpicture}
+
  \node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {};
+
  \node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  \node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  \node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  \node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  \node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  \node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {};
  
  \draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4);
  \draw[-, ultra thick] (2,0.4) -- (5,0.4);
  \draw[-, ultra thick] (2,0.4) -- (p1_small);
  \draw[-, ultra thick] (2.6,0.4) -- (p2_small);
  \draw[-, ultra thick] (3.2,0.4) -- (p3_small);
  \draw[-, ultra thick] (3.8,0.4) -- (p4_small);
  \draw[-, ultra thick] (4.4,0.4) -- (p5_small);
  \draw[-, ultra thick] (5,0.4) -- (p6_small);
  
  
  \end{tikzpicture}
  } ;
  
+ \draw[->, thick] (1.5,0.2) -- (small_node) ;
+ \draw[-] (-2.7,0.75) -- (-2.3,0.725);
  \onslide<5->
- \node (add_info) at (0,-0.1) [scale=0.9] {\scriptsize node};
+ \node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node};
  
  % Black Arrows

@@ -177 +180 @@
  % OpenMP Arrows
  \onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ;
- \draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.5,0.7) ;
+ \draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ;
  
  % Network decorations

@@ -209 +212 @@
  \draw[-] (-2.45,1.0) -- (pr6);
  \draw[-] (mem6) -- (pr6);
  
+ \onslide<1->
  \end{tikzpicture}
  \end{center}

@@ -245 +249 @@
  \begin{itemize}
  \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used in case of slow networks, but this generally doesn't scale for processor numbers $>$ 256.
+ \vspace{2mm}
  \item<11-> Message passing is realized using MPI.
+ \vspace{2mm}
  \item<12-> OpenMP parallelization as well as mixed usage of OpenMP and
  MPI is also possible. (OpenMP tests and optimization is under way)

@@ -279 +285 @@
  \onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center}
  \vspace{-4mm}
- \textbf{Example: transpositions for solving the poisson equation}
+ \textbf{Example: transpositions for solving the Poisson\\ \hspace{4em}equation}
  \end{column}
  \end{columns}

@@ -353 +359 @@
  \begin{frame}
  \frametitle{Virtual Processor Grid Used in PALM}
- \footnotesize
- The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}
+ \scriptsize
+ \vspace{2mm}
+ The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\
+ \ \\
  \begin{itemize}
  \item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so called communicator (here: \texttt{comm2d}):\\
- \scriptsize
+ \tiny
+ \vspace{1.5mm}
  \quad \texttt{ndim = 2}\\
- \quad \texttt{pdims(1) = npex ! \# of processors along x}\\
- \quad \texttt{pdims(2) = npey ! \# of processors along y}\\
+ \quad \texttt{pdims(1) = npex \quad ! \# of processors along x}\\
+ \quad \texttt{pdims(2) = npey \quad ! \# of processors along y}\\
  \quad \texttt{cyclic(1) = .TRUE.}\\
  \quad \texttt{cyclic(2) = .TRUE.}\\
-
- \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, \&}\\
- \quad \texttt{\hspace{10.5em} reorder, \textcolor{blue}{comm2d}, ierr )}
+ \ \\
+ \quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\
+ \quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )}
+ \scriptsize
+ \vspace{4mm}
  \item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\
- \scriptsize
- \quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )}
+ \tiny
+ \vspace{1.5mm}
+ \quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )}
+ \scriptsize
+ \vspace{4mm}
  \item<4-> The ids of the neighbouring PEs are determined by:\\
- \scriptsize
+ \tiny
+ \vspace{1.5mm}
  \quad \texttt{CALL MPI\underline{\ }CARD\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\
- \quad \texttt{CALL MPI\underline{\ }CARD\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}
+ \quad \texttt{CALL MPI\underline{\ }CARD\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\
  \end{itemize}
  \end{frame}

@@ -383 +398 @@
  \item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\
  \tiny
+ \vspace{2mm}
  \quad \texttt{u(:,:,nxl\textcolor{blue}{-ngl}), u(:,:,nxr\textcolor{blue}{+ngl}) ! left and right boundary}\\
  \quad \texttt{u(:,nys\textcolor{blue}{-ngl},:), u(:,nyn\textcolor{blue}{+ngl},:) ! south and north boundary}\\
- \hspace{3mm}
+ \vspace{4mm}
  \item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\
  \textbf{\underline{Simplified} example:} synchroneous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\
  \tiny
+ \vspace{2mm}
  \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-\textcolor{blue}{ngl},nxl), ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft, 0,}\\
  \quad \texttt{\hspace{9.5em}ar(nzb,nys-\textcolor{blue}{ngl},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\
  \quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\
- \hspace{3mm}
+ \vspace{4mm}
  \item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\
  \tiny
+ \vspace{2mm}
  \quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{ngl} )}\\
  \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{ngl}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\
- \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }xz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\
+ \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\
  \ \\
  \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-ngl,nxl), type\underline{\ }yz(grid\underline{\ }level), MPI\underline{\ }REAL, pleft, 0, ...}\\

@@ -431 +449 @@
  \frametitle{Parallel I/O}
  \scriptsize
+ \vspace{-2mm}
  \begin{columns}[T]
  \begin{column}{1.1\textwidth}

@@ -449 +468 @@
  \onslide<4-> \textbf{General comment:}
  \begin{itemize}
- \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems). A workaround for this problem will\\ be available soon.
+ \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-options)
  \end{itemize}
  \end{column}
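The frames edited in this revision all revolve around PALM's 2-D domain decomposition: a cyclic virtual processor grid created with MPI_CART_CREATE, neighbour ranks obtained with MPI_CART_SHIFT (the standard MPI name; the slides typeset it as MPI_CARD_SHIFT), and ghost-point exchange via MPI_SENDRECV. The following stand-alone Fortran sketch illustrates that pattern end to end. It is not code from init_pegrid.f90 or exchange_horiz.f90; the subdomain bounds, the ghost-layer width (ngl = 1) and all variable names beyond the MPI calls are illustrative assumptions.

! Minimal sketch (not PALM code): 2-D cyclic processor grid and one
! synchronous ghost-point exchange along x, in the spirit of the slides
! on init_pegrid.f90 / exchange_horiz.f90.  All sizes are made up.
program decomposition_sketch
   use mpi
   implicit none

   integer, parameter :: ngl = 1              ! ghost layer width (assumed)
   integer :: comm2d, myid, nprocs, ierr
   integer :: pdims(2)
   logical :: cyclic(2), reorder
   integer :: pleft, pright, psouth, pnorth
   integer :: nzb, nzt, nys, nyn, nxl, nxr, ngp_yz
   integer :: status(MPI_STATUS_SIZE)
   real, allocatable :: ar(:,:,:)

   call MPI_INIT( ierr )
   call MPI_COMM_SIZE( MPI_COMM_WORLD, nprocs, ierr )

   ! virtual 2-D processor grid with cyclic boundary conditions
   ! (cf. slide "Virtual Processor Grid Used in PALM")
   pdims   = 0                                ! let MPI choose npex x npey
   call MPI_DIMS_CREATE( nprocs, 2, pdims, ierr )
   cyclic  = .TRUE.
   reorder = .TRUE.
   call MPI_CART_CREATE( MPI_COMM_WORLD, 2, pdims, cyclic, reorder, comm2d, ierr )
   call MPI_COMM_RANK( comm2d, myid, ierr )

   ! ranks of the four horizontal neighbours
   call MPI_CART_SHIFT( comm2d, 0, 1, pleft,  pright, ierr )
   call MPI_CART_SHIFT( comm2d, 1, 1, psouth, pnorth, ierr )

   ! toy subdomain with ghost layers (index ranges are illustrative only)
   nzb = 0; nzt = 16; nys = 0; nyn = 15; nxl = 0; nxr = 15
   allocate( ar(nzb:nzt+1, nys-ngl:nyn+ngl, nxl-ngl:nxr+ngl) )
   ar = REAL( myid )

   ! number of elements in one yz-plane including ghost rows (ngl = 1,
   ! so a single contiguous plane is exchanged, as in the slide formula)
   ngp_yz = ( nzt - nzb + 2 ) * ( nyn - nys + 1 + 2*ngl )

   ! send own left boundary plane to the left neighbour, receive the
   ! right neighbour's boundary plane into the own right ghost plane
   call MPI_SENDRECV( ar(nzb,nys-ngl,nxl),   ngp_yz, MPI_REAL, pleft,  0, &
                      ar(nzb,nys-ngl,nxr+1), ngp_yz, MPI_REAL, pright, 0, &
                      comm2d, status, ierr )

   call MPI_FINALIZE( ierr )
end program decomposition_sketch

Compiled with an MPI Fortran wrapper (e.g. mpif90) and run on a few processes, each rank receives its right neighbour's boundary plane into its own right ghost plane; exchange_horiz.f90 performs the analogous operation for all four lateral boundaries, using the MPI vector data types shown above for the non-contiguous xz-planes.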