Home

Context Navigation

source: palm/trunk/TUTORIAL/SOURCE/parallelization.tex @ 988

Last change on this file since 988 was 973, checked in by maronga, 12 years ago
tutorial updates
Property svn:executable set to ``* Property svn:keywords set to `Id`
File size: 24.9 KB

Rev	Line
[945]	1	% $Id: parallelization.tex 973 2012-08-07 16:03:47Z heinze $
	2	\input{header_tmp.tex}
	3	%\input{../header_lectures.tex}
	4
	5	\usepackage[utf8]{inputenc}
	6	\usepackage{ngerman}
	7	\usepackage{pgf}
	8	\usetheme{Dresden}
	9	\usepackage{subfigure}
	10	\usepackage{units}
	11	\usepackage{multimedia}
	12	\usepackage{hyperref}
	13	\newcommand{\event}[1]{\newcommand{\eventname}{#1}}
	14	\usepackage{xmpmulti}
	15	\usepackage{tikz}
	16	\usetikzlibrary{shapes,arrows,positioning}
	17	\usetikzlibrary{decorations.markings} %neues paket
	18	\usetikzlibrary{decorations.pathreplacing} %neues paket
	19	\def\Tiny{\fontsize{4pt}{4pt}\selectfont}
	20	\usepackage{amsmath}
	21	\usepackage{amssymb}
	22	\usepackage{multicol}
	23	\usepackage{pdfcomment}
	24	\usepackage{graphicx}
	25	\usepackage{listings}
	26	\lstset{showspaces=false,language=fortran,basicstyle=
	27	\ttfamily,showstringspaces=false,captionpos=b}
	28
	29	\institute{Institut für Meteorologie und Klimatologie, Leibniz Universität Hannover}
	30	\date{last update: \today}
	31	\event{PALM Seminar}
	32	\setbeamertemplate{navigation symbols}{}
	33
	34	\setbeamertemplate{footline}
	35	{
	36	\begin{beamercolorbox}[rightskip=-0.1cm]&
	37	{\includegraphics[height=0.65cm]{imuk_logo.pdf}\hfill \includegraphics[height=0.65cm]{luh_logo.pdf}}
	38	\end{beamercolorbox}
	39	\begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,
	40	leftskip=.3cm,rightskip=0.3cm plus1fil]{title in head/foot}
	41	{\leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor} \hfill \eventname \hfill \insertframenumber \; / \inserttotalframenumber}
	42	\end{beamercolorbox}
	43	\begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot}
	44	\end{beamercolorbox}
	45	}
	46	%\logo{\includegraphics[width=0.3\textwidth]{luhimuk_logo.pdf}}
	47
	48	\title[Parallelization]{Parallelization}
	49	\author{Siegfried Raasch}
	50
	51	\begin{document}
	52
	53	% Folie 1
	54	\begin{frame}
	55	\titlepage
	56	\end{frame}
	57
	58	\section{Parallelization}
	59	\subsection{Parallelization}
	60
	61	% Folie 2
	62	\begin{frame}
	63	\frametitle{Basics of Parallelization}
	64	\tikzstyle{yellow} = [rectangle, fill=yellow!20, text width=0.6\textwidth, font=\scriptsize]
	65	\scriptsize
	66	\textbf{Parallelization:}
	67	\begin{itemize}
	68	\item<2-> All processor elements (PE, core) are carrying out the same program (SIMD).
	69	\item<3-> Each PE of a parallel computer operates on a different set of data.
	70	\end{itemize}
	71
	72	\ \\
	73	\onslide<4->\textbf{Realization:}
	74	\begin{columns}[T]
	75	\begin{column}{0.45\textwidth}
	76	\onslide<5->each PE solves the equations for a different subdomain of the total domain
	77	\begin{center}
	78	\includegraphics[width=0.5\textwidth]{parallelization_figures/subdomain.png}
	79	\end{center}
	80	\onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\
	81	\onslide<9->\textbf{message passing model (MPI)}
	82	\end{column}
	83	\begin{column}{0.45\textwidth}
	84	\onslide<6->program loops are parallelized, i.e. each processor solves for a subset of the total index range
	85	\begin{center}
	86	\begin{tikzpicture}[auto, node distance=0]
	87	\node [yellow] (1) {%
	88	\texttt{!\$OMP DO}\\
	89	\texttt{\quad \quad DO i = 1, 100}\\
	90	\quad \quad \quad $\vdots$\\
	91	\texttt{\quad \quad ENDDO}};
	92	\end{tikzpicture}
	93	\end{center}
	94	\onslide<8-> parallelization can easily be done by the compiler, if all PEs have access to all variables (shared memory)\\
	95	\onslide<10-> \textbf{shared memory model (OpenMP)}
	96	\end{column}
	97	\end{columns}
	98	\end{frame}
	99
	100	% Folie 3
	101	\begin{frame}
	102	\frametitle{Basic Architectures of Massively Parallel Computers}
	103	\tikzstyle{info} = [rectangle, text width=0.25\textwidth, font=\scriptsize]
	104
	105
	106	\begin{center}
	107	\begin{tikzpicture}
	108
	109	\node (center) at (0,1) {};
[973]	110	\onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network};
[945]	111	\node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-T3E)};
	112	\onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory};
	113	\node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)};
	114	\onslide<7-> \node (MPI) at (-3.5,-3) [ellipse,fill=yellow!90] {MPI};
	115	\onslide<8-> \node (OpenMP) at (3.5,-3) [ellipse,fill=yellow!90] {OpenMP};
	116	\onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems};
	117	\node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster,
[973]	118	NEC-SX, SGI-ICE, Cray-XE6)};
[945]	119
	120	% Adressable memory node (big)
	121
	122	\onslide<3-> \node (p1) at (2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	123	\node (p2) at (2.6,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	124	\node (p3) at (3.2,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	125	\node (p4) at (3.8,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	126	\node (p5) at (4.4,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	127	\node (p6) at (5,-0.05) [draw,circle, scale=0.9] {\scriptsize p};
	128
	129	\draw[-] (3.5,0.7) -- (3.5,0.4);
	130	\draw[-] (2,0.4) -- (5,0.4);
	131	\draw[-] (2,0.4) -- (p1);
	132	\draw[-] (2.6,0.4) -- (p2);
	133	\draw[-] (3.2,0.4) -- (p3);
	134	\draw[-] (3.8,0.4) -- (p4);
	135	\draw[-] (4.4,0.4) -- (p5);
	136	\draw[-] (5,0.4) -- (p6);
	137
	138	% Adressable memory node (small)
	139	\onslide<4->
[973]	140
	141	\node (small_node) at (-2,0.6) [scale=0.2] {%
	142	\begin{tikzpicture}
	143
	144	\node (add_mem_small) at (3.5,0.9) [ultra thick, rectangle, draw, minimum width=3cm] {};
	145
	146	\node (p1_small) at (2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
	147	\node (p2_small) at (2.6,-0.05) [ultra thick, draw,circle, scale=0.9] {};
	148	\node (p3_small) at (3.2,-0.05) [ultra thick, draw,circle, scale=0.9] {};
	149	\node (p4_small) at (3.8,-0.05) [ultra thick, draw,circle, scale=0.9] {};
	150	\node (p5_small) at (4.4,-0.05) [ultra thick, draw,circle, scale=0.9] {};
	151	\node (p6_small) at (5,-0.05) [ultra thick, draw,circle, scale=0.9] {};
[945]	152
[973]	153	\draw[-, ultra thick] (add_mem_small.south) -- (3.5,0.4);
	154	\draw[-, ultra thick] (2,0.4) -- (5,0.4);
	155	\draw[-, ultra thick] (2,0.4) -- (p1_small);
	156	\draw[-, ultra thick] (2.6,0.4) -- (p2_small);
	157	\draw[-, ultra thick] (3.2,0.4) -- (p3_small);
	158	\draw[-, ultra thick] (3.8,0.4) -- (p4_small);
	159	\draw[-, ultra thick] (4.4,0.4) -- (p5_small);
	160	\draw[-, ultra thick] (5,0.4) -- (p6_small);
	161
	162
	163	\end{tikzpicture}
	164	} ;
[945]	165
[973]	166	\draw[->, thick] (1.5,0.2) -- (small_node) ;
	167	\draw[-] (-2.7,0.75) -- (-2.3,0.725);
[945]	168	\onslide<5->
[973]	169	\node[below=-0.1cm of small_node] (add_info) [scale=0.9] {\scriptsize node};
[945]	170
	171	% Black Arrows
	172	\onslide<6-> \draw[->, thick] (-2.5,-1.5) -- (-0.8,-2.2) ;
	173	\draw[->, thick] (2.5,-1.5) -- (0.8,-2.2) ;
	174
	175	% MPI Arrows
	176	\onslide<7-> \draw[->, ultra thick, color=yellow] (-3.5,-2.7) -- (-3.5,-1.5) ;
	177	\draw[->, ultra thick, color=yellow] (-2.9,-3) -- (-1.0,-3.0) ;
	178	\draw[->, ultra thick, color=yellow] (-3.0,-2.8) -- (1.5,-1.0) ;
	179
	180	% OpenMP Arrows
	181	\onslide<8-> \draw[->, ultra thick, color=yellow] (3.5,-2.6) -- (3.5,-1.5) ;
[973]	182	\draw[->, ultra thick, color=yellow] (2.5,-2.8) -- (-2.0,0.1) ;
[945]	183
	184	% Network decorations
	185	\onslide<2-> \node (pr1) at (-4.6,0.7) [draw,circle,scale=0.5] {};
	186	\node (mem1) at (-4.6,0.45) [draw,rectangle,scale=0.5] {};
	187	\draw[-] (-4.5,0.9) -- (pr1);
	188	\draw[-] (mem1) -- (pr1);
	189	\draw[-] ([xshift=0.02cm]pr1.east) -- ([xshift=0.3cm, yshift=-0.2cm]pr1.east);
	190	\draw[-] ([xshift=0.02cm]mem1.east) -- ([xshift=0.3cm, yshift=-0.2cm]mem1.east);
	191	\node at (-3.7,0.45) {\tiny processor};
	192	\node at (-3.3,0.25) {\tiny addressable memory};
	193
	194	\node (pr2) at (-4.6,1.3) [draw,circle,scale=0.5] {};
	195	\node (mem2) at (-4.6,1.55) [draw,rectangle,scale=0.5] {};
	196	\draw[-] (-4.5,1.1) -- (pr2);
	197	\draw[-] (mem2) -- (pr2);
	198	\node (pr3) at (-3.9,1.5) [draw,circle,scale=0.5] {};
	199	\node (mem3) at (-3.9,1.75) [draw,rectangle,scale=0.5] {};
	200	\draw[-] (-3.8,1.3) -- (pr3);
	201	\draw[-] (mem3) -- (pr3);
	202	\node (pr4) at (-4.9,0.95) [draw,circle,scale=0.5] {};
	203	\node (mem4) at (-4.9,0.7) [draw,rectangle,scale=0.5] {};
	204	\draw[-] (-4.55,1.0) -- (pr4);
	205	\draw[-] (mem4) -- (pr4);
	206	\node (pr5) at (-3.0,1.5) [draw,circle,scale=0.5] {};
	207	\node (mem5) at (-3.0,1.75) [draw,rectangle,scale=0.5] {};
	208	\draw[-] (-3.08,1.3) -- (pr5);
	209	\draw[-] (mem5) -- (pr5);
	210	\node (pr6) at (-2.2,1.1) [draw,circle,scale=0.5] {};
	211	\node (mem6) at (-2.2,1.35) [draw,rectangle,scale=0.5] {};
	212	\draw[-] (-2.45,1.0) -- (pr6);
	213	\draw[-] (mem6) -- (pr6);
[973]	214
	215	\onslide<1->
[945]	216	\end{tikzpicture}
	217	\end{center}
	218
	219	\end{frame}
	220
	221	% Folie 4
	222	\begin{frame}
	223	\frametitle{PALM Parallelization Model}
	224	\scriptsize
	225	\onslide<2-> \textbf{General demands for a parallelized program:}
	226	\begin{itemize}
	227	\item<3-> Load balancing
	228	\item<4-> Small communication overhead
	229	\item<5-> Scalability (up to large numbers of processors)
	230	\end{itemize}
	231	\vspace{2mm}
	232	\onslide<6-> \textbf{The basic parallelization method used for PALM is a 2D-domain decomposition along $x$ and $y$:}\\
	233	\begin{columns}[T]
	234	\begin{column}{0.3\textwidth}
	235	\onslide<7-> \includegraphics[width=1.0\textwidth]{parallelization_figures/subdomain.png}
	236	\end{column}
	237	\begin{column}{0.6\textwidth}
	238	\ \\
	239	\onslide<8-> contiguous data in memory (FORTRAN):\\
	240	\ \\
	241	\ \\
	242	\textcolor{blue}{columns of i}\\
	243	\textcolor{red}{no contiguous data at all}\\
	244	\onslide<9-> \textcolor{blue}{columns of k}\\
	245	\textcolor{red}{planes of k,j (all data contiguous)}
	246	\end{column}
	247	\end{columns}
	248	\vspace{2mm}
	249	\begin{itemize}
	250	\item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used in case of slow networks, but this generally doesn't scale for processor numbers $>$ 256.
[973]	251	\vspace{2mm}
[945]	252	\item<11-> Message passing is realized using MPI.
[973]	253	\vspace{2mm}
[945]	254	\item<12-> OpenMP parallelization as well as mixed usage of OpenMP and
	255	MPI is also possible. (OpenMP tests and optimization are under way)
	256	\end{itemize}
	257	\end{frame}
	258
	259	% Folie 5
	260	\begin{frame}
	261	\frametitle{Implications of Decomposition}
	262	\scriptsize
	263	\begin{columns}[T]
	264	\begin{column}{0.5\textwidth}
	265	\begin{itemize}
	266	\item<2-> Central finite differences cause \textcolor{red}{local data dependencies}\\
	267	\ \\
	268	solution: introduction of \textcolor{red}{ghost points}
	269	\vspace{5mm}
	270
	271	\begin{flushright}
	272	\onslide<3-> $\left. \dfrac{\partial \psi}{\partial x} \right|_i = \dfrac{\psi_{i+1} - \psi_{i-1}}{2 \Delta x}$
	273	\end{flushright}
	274	\vspace{10mm}
	275	\item<4-> FFT and linear equation solver cause \textcolor{red}{non-local data dependencies}\\
	276	\ \\
	277	solution: transposition of 3D-arrays
	278	\end{itemize}
	279	\end{column}
	280	\begin{column}{0.6\textwidth}
	281	\begin{center}
	282	\vspace{-5mm}
	283	\onslide<3-> \includegraphics[width=0.7\textwidth]{parallelization_figures/ghost_points.png}
	284	\vspace{4mm}
	285	\onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center}
	286	\vspace{-4mm}
[973]	287	\textbf{Example: transpositions for solving the Poisson\\ \hspace{4em}equation}
[945]	288	\end{column}
	289	\end{columns}
	290	\end{frame}
	291
	292	% Folie 6
	293	\begin{frame}
	294	\frametitle{How to Use the Parallelized Version of PALM}
	295	\scriptsize
	296	\begin{columns}[T]
	297	\begin{column}{1.12\textwidth}
	298	\begin{itemize}
	299	\item<1-> The parallel version of PALM is switched on by \texttt{mrun}-option ``\texttt{-K parallel}''. Additionally, the number of required processors and the number of tasks per node (number of PEs to be used on one node) have to be provided:\\
	300	\quad \texttt{mrun ... -K parallel -X64 -T8 ...}
	301	\item<2-> From an accounting point of view, it is always most efficient to use all PEs of a node (\texttt{-T8}) (in case of a ``non-shared'' usage of nodes).
	302	\item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1-2 PEs less than the total number of PEs on the node are used.
	303	\item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g. \texttt{-h necriam}). The virtual processor grid to be used can be set manually by d3par-parameters \texttt{npex} and \texttt{npey}.
	304	\item<6-> Using the Open-MP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?).\\
	305	\end{itemize}
	306	\begin{center}
	307	\vspace{-7mm}
	308	\onslide<5-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png}
	309	\end{center}
	310	\end{column}
	311	\end{columns}
	312	\end{frame}
	313
	314	% Folie 7
	315	\begin{frame}
	316	\frametitle{MPI Communication}
	317	\scriptsize
	318	\begin{columns}[T]
	319	\begin{column}{1.12\textwidth}
	320	\begin{itemize}
	321	\item<1-> MPI (message passing interface) is a portable interface for communication between PEs (FORTRAN or C library).
	322	\vspace{2mm}
	323	\item<2-> To make MPI available on HLRN's SGI-ICE, the module \texttt{mpt} must be loaded by setting the \texttt{\%modules} option in .mrun.config appropriately:
	324
	325	\quad \texttt{\%modules ...:mpt:...}
	326	\vspace{2mm}
	327	\item<2-> The path to the MPI-library may have to be given in the compiler call, by setting an appropriate option in the configuration file .mrun.config:
	328
	329	\quad \texttt{\%lopts -axW:-cpp:-r8:-nbs:-Vaxlib:\textcolor{blue}{-L:<replace by mpi library path>:-lmpi}}
	330	\vspace{2mm}
	331	\item<3-> All MPI calls must be within\\
	332	\quad \texttt{CALL MPI\_INIT( ierror )}\\
	333	\quad $\vdots$\\
	334	\quad \texttt{CALL MPI\_FINALIZE( ierror )}\\
	335	\end{itemize}
	336
	337	\end{column}
	338	\end{columns}
	339	\end{frame}
	340
	341	% Folie 8
	342	\begin{frame}
	343	\frametitle{Communication Within PALM}
	344	\small
	345	\begin{itemize}
	346	\item<1-> MPI calls within PALM are available when using the \texttt{mrun}-option ``\texttt{-K parallel}''.
	347	\item<2-> Communication is needed for
	348	\begin{itemize}
	349	\footnotesize
	350	\item<2-> exchange of ghost points
	351	\item<3-> transpositions (FFT-Poisson-solver)
	352	\item<4-> calculating global sums (e.g. for calculating horizontal averages)
	353	\end{itemize}
	354	\item<5-> Additional MPI calls are required to define the so-called virtual processor grid and to define special data types needed for more comfortable exchange of data.
	355	\end{itemize}
	356	\end{frame}
	357
	358	% Folie 9
	359	\begin{frame}
	360	\frametitle{Virtual Processor Grid Used in PALM}
[973]	361	\scriptsize
	362	\vspace{2mm}
	363	The processor grid and special data types are defined in file \texttt{init\_pegrid.f90}\\
	364	\ \\
[945]	365	\begin{itemize}
	366	\item<2-> PALM uses a two-dimensional virtual processor grid (in case of a 1D-decomposition, it has only one element along $y$). It is defined by a so-called communicator (here: \texttt{comm2d}):\\
[973]	367	\tiny
	368	\vspace{1.5mm}
[945]	369	\quad \texttt{ndim = 2}\\
[973]	370	\quad \texttt{pdims(1) = npex \quad ! \# of processors along x}\\
	371	\quad \texttt{pdims(2) = npey \quad ! \# of processors along y}\\
[945]	372	\quad \texttt{cyclic(1) = .TRUE.}\\
	373	\quad \texttt{cyclic(2) = .TRUE.}\\
[973]	374	\ \\
	375	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }CREATE( MPI\underline{\ }COMM\underline{\ }WORLD, ndim, pdims, cyclic, reorder, \&}\\
	376	\quad \texttt{\hspace{10.5em} \textcolor{blue}{comm2d}, ierr )}
	377	\scriptsize
	378	\vspace{4mm}
[945]	379	\item<3-> The processor number (id) with respect to this processor grid, \texttt{myid}, is given by:\\
[973]	380	\tiny
	381	\vspace{1.5mm}
	382	\quad \texttt{CALL MPI\underline{\ }COMM\underline{\ }RANK( comm2d, \textcolor{blue}{myid}, ierr )}
	383	\scriptsize
	384	\vspace{4mm}
[945]	385	\item<4-> The ids of the neighbouring PEs are determined by:\\
[973]	386	\tiny
	387	\vspace{1.5mm}
[945]	388	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 0, 1, \textcolor{blue}{pleft}, \textcolor{blue}{pright}, ierr )}\\
[973]	389	\quad \texttt{CALL MPI\underline{\ }CART\underline{\ }SHIFT( comm2d, 1, 1, \textcolor{blue}{psouth}, \textcolor{blue}{pnorth}, ierr )}\\
[945]	390	\end{itemize}
	391	\end{frame}
	392
	393	% Folie 10
	394	\begin{frame}
	395	\frametitle{Exchange of ghost points}
	396	\scriptsize
	397	\begin{itemize}
	398	\item<1-> Ghost points are stored in additional array elements added at the horizontal boundaries of the subdomains, e.g.\\
	399	\tiny
[973]	400	\vspace{2mm}
[945]	401	\quad \texttt{u(:,:,nxl\textcolor{blue}{-ngl}), u(:,:,nxr\textcolor{blue}{+ngl}) ! left and right boundary}\\
	402	\quad \texttt{u(:,nys\textcolor{blue}{-ngl},:), u(:,nyn\textcolor{blue}{+ngl},:) ! south and north boundary}\\
[973]	403	\vspace{4mm}
[945]	404	\item<2-> \scriptsize The exchange of ghost points is done in file \texttt{exchange\underline{\ }horiz.f90}\\
	405	\textbf{\underline{Simplified} example:} synchronous exchange of ghost points along $x$ ($yz$-planes, send left, receive right plane):\\
	406	\tiny
[973]	407	\vspace{2mm}
[945]	408	\quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-\textcolor{blue}{ngl},nxl), ngp\underline{\ }yz, MPI\underline{\ }REAL, pleft, 0,}\\
	409	\quad \texttt{\hspace{9.5em}ar(nzb,nys-\textcolor{blue}{ngl},nxr+1), ngp\underline{\ }yz, MPI\underline{\ }REAL, pright, 0,}\\
	410	\quad \texttt{\hspace{9.5em}comm2d, status, ierr )}\\
[973]	411	\vspace{4mm}
[945]	412	\item<3-> \scriptsize In the real code special MPI data types (vectors) are defined for exchange of $yz$/$xz$-planes for performance reasons and because array elements to be exchanged are not consecutively stored in memory for $xz$-planes:\\
	413	\tiny
[973]	414	\vspace{2mm}
[945]	415	\quad \texttt{ngp\underline{\ }yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * \textcolor{blue}{ngl} )}\\
	416	\quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }VECTOR( \textcolor{blue}{ngl}, ngp\underline{\ }yz(0), ngp\underline{\ }yz(0), MPI\underline{\ }REAL, type\underline{\ }yz(0), ierr )}\\
[973]	417	\quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr ) ! see file init\underline{\ }pegrid.f90}\\
[945]	418	\ \\
	419	\quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-ngl,nxl), type\underline{\ }yz(grid\underline{\ }level), MPI\underline{\ }REAL, pleft, 0, ...}\\
	420	\end{itemize}
	421	\end{frame}
	422
	423	% Folie 11
	424	\begin{frame}
	425	\frametitle{Transpositions}
	426	\footnotesize
	427	\begin{columns}[T]
	428	\begin{column}{1.05\textwidth}
	429	\begin{itemize}
	430	\item<1-> Transpositions can be found in file \texttt{transpose.f90} (several subroutines for 1D- or 2D-decompositions; they are called mainly from the FFT pressure solver, see \texttt{poisfft.f90}).\\
	431	\ \\
	432	\item<2-> The following example is for a transposition from $x$ to $y$, i.e. for the input array all data elements along $x$ reside on the same PE, while after the transposition, all elements along $y$ are on the same PE:\\
	433	\ \\
	434	\scriptsize
	435	\texttt{!}\\
	436	\texttt{!-- in SUBROUTINE transpose\underline{\ }xy:}\\
	437	\texttt{CALL MPI\underline{\ }ALLTOALL( f\underline{\ }inv(nys\underline{\ }x,nzb\underline{\ }x,0), sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
	438	\texttt{\hspace{9.5em}work(1), \hspace{6.5em}sendrecvcount\underline{\ }xy, MPI\underline{\ }REAL, \&}\\
	439	\texttt{\hspace{9.5em}comm1dy, ierr )}\\
	440	\ \\
	441	\item<3-> The data resorting before and after the calls of MPI\_ALLTOALL is highly optimized to account for the different processor architectures.
	442	\end{itemize}
	443	\end{column}
	444	\end{columns}
	445	\end{frame}
	446
	447	% Folie 12
	448	\begin{frame}
	449	\frametitle{Parallel I/O}
	450	\scriptsize
[973]	451	\vspace{-2mm}
[945]	452	\begin{columns}[T]
	453	\begin{column}{1.1\textwidth}
	454	\begin{itemize}
	455	\item<1-> PALM writes and reads some of the input/output files in parallel, i.e. each processor writes/reads its own file. \textbf{Each file then has a different name!}\\
	456	\ \\
	457	\textbf{Example:} binary files for restart are written into a subdirectory of the PALM working directory:\\
	458	\quad \texttt{BINOUT/\_0000}\\
	459	\quad \texttt{BINOUT/\_0001}\\
	460	\quad $\vdots$
	461	\item<2-> These files can be handled (copied) by \texttt{mrun} using the file attribute \texttt{pe} in the configuration file:\\
	462	\texttt{BINOUT out:loc:pe restart \~{}/palm/current\underline{\ }version/JOBS/\$fname/RESTART \underline{\ }d3d}\\
	463	\ \\
	464	\onslide<3->In this case, filenames are interpreted as directory names.
	465	An \texttt{mrun} call using option\\
	466	``\texttt{-d example\underline{\ }cbl -r restart}'' will copy the local \textbf{\underline{directory}} \texttt{BINOUT} to the \textbf{\underline{directory}} \texttt{.../RESTART/example\underline{\ }cbl\underline{\ }d3d} .
	467	\end{itemize}
	468	\onslide<4-> \textbf{General comment:}
	469	\begin{itemize}
[973]	470	\item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-options)
[945]	471	\end{itemize}
	472	\end{column}
	473	\end{columns}
	474	\end{frame}
	475
	476
	477
	478	% Folie 13
	479	\begin{frame}
	480	\frametitle{PALM Parallel I/O for 2D/3D Data}
	481	\footnotesize
	482	\begin{itemize}
	483	\item<1-> 2D- and 3D-data output is also written in parallel by the processors (2D: by default, 3D: generally).
	484	\item<2-> Because the graphics software (\texttt{ncview}, \texttt{ncl}, \texttt{ferret}, etc.) expects the data to be in one file, these output files have to be merged to one single file after PALM has finished.\\
	485	\ \\
	486	This is done within the job by calling the utility program \texttt{combine\underline{\ }plot\underline{\ }fields.x} after PALM has successfully finished.
	487	\item<3-> \texttt{combine\underline{\ }plot\underline{\ }fields.x} is automatically executed by \texttt{mrun}.
	488	\item<4-> The executable \texttt{combine\underline{\ }plot\underline{\ }fields.x} is created during the installation process by the command\\
	489	\ \\
	490	\quad \texttt{mbuild -u -h <host identifier>}
	491	\end{itemize}
	492	\end{frame}
	493
	494	% Folie 14
	495	\begin{frame}
	496	\frametitle{Performance Examples (I)}
	497	\begin{itemize}
	498	\item Simulation using 1536 * 768 * 242 grid points ($\sim$ 60 GByte)
	499	\end{itemize}
	500	\includegraphics[scale=0.28]{parallelization_figures/perf_left.png}
	501	\includegraphics[scale=0.28]{parallelization_figures/perf_right.png}
	502	\includegraphics[scale=0.28]{parallelization_figures/legende.png} \\
	503	\scriptsize
	504	\begin{columns}[T]
	505	\begin{column}{0.18\textwidth}
	506	\end{column}
	507	\begin{column}{0.4\textwidth}
	508	IBM-Regatta, HLRN, Hannover\\
	509	(1D domain decomposition)
	510	\end{column}
	511	\begin{column}{0.4\textwidth}
	512	Sun Fire X4600, Tokyo Institute of Technology\\
	513	(2D domain decomposition)
	514	\end{column}
	515	\begin{column}{0.2\textwidth}
	516	\end{column}
	517
	518	\end{columns}
	519
	520	\end{frame}
	521
	522	% Folie 15
	523	\begin{frame}
	524	\frametitle{Performance Examples (II)}
	525	\begin{itemize}
	526	\item Simulation with $2048^3$ grid points ($\sim$ 2 TByte memory)
	527	\end{itemize}
	528	\begin{columns}[T]
	529	\begin{column}{0.5\textwidth}
	530	\includegraphics[scale=0.25]{parallelization_figures/perf_3.png} \\
	531	\scriptsize
	532	\quad SGI-ICE2, HLRN-II, Hannover\\
	533	\quad (2D-domain decomposition)
	534	\end{column}
	535	\begin{column}{0.5\textwidth}
	536	\vspace{35mm}
	537	\onslide<2-> currently largest simulation feasible on that system:\\
	538	\ \\
	539	$4096^3$ grid points
	540	\end{column}
	541	\end{columns}
	542	\end{frame}
	543
	544	\end{document}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |