Timestamp: Sep 18, 2013 1:19:19 PM
Author: fuhrmann
Message: several updates in the tutorial

File edited:
  • palm/trunk/TUTORIAL/SOURCE/parallelization.tex

Legend: lines beginning with "-" were removed in r1226, lines beginning with "+" were added, and lines beginning with a space are unchanged context; "@@ lines N-M @@" gives the affected line range of parallelization.tex.

r973 → r1226

@@ lines 15-20 @@
 \usepackage{tikz}
 \usetikzlibrary{shapes,arrows,positioning}
-\usetikzlibrary{decorations.markings}             % new package
-\usetikzlibrary{decorations.pathreplacing}        % new package
+%\usetikzlibrary{decorations.markings}             % new package
+%\usetikzlibrary{decorations.pathreplacing}        % new package
 \def\Tiny{\fontsize{4pt}{4pt}\selectfont}
 \usepackage{amsmath}
     
@@ lines 76-80 @@
          \onslide<5->each PE solves the equations for a different subdomain of the total domain
          \begin{center}
-            \includegraphics[width=0.5\textwidth]{parallelization_figures/subdomain.png}
+            \includegraphics[width=0.3\textwidth]{parallelization_figures/subdomain_folie2.png}
          \end{center}
          \onslide<7->each PE only knows the variable values from its subdomain, communication / data exchange between PEs is necessary\\
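
The slide in the hunk above only illustrates the idea of splitting the total domain among the PEs. As a hedged sketch (plain Fortran, not PALM code; the grid size, the 4 x 4 processor grid, and names such as nxl/nxr/nys/nyn are assumptions modelled on identifiers that appear later in these slides), the index bounds of each PE's subdomain could be assigned like this:

    ! Hedged sketch: assigning subdomain index bounds in a 2D decomposition.
    ! All sizes and variable names are illustrative assumptions.
    PROGRAM decomposition_sketch

       IMPLICIT NONE

       INTEGER :: nx = 1023, ny = 1023     ! total horizontal grid: i = 0..nx, j = 0..ny
       INTEGER :: npex = 4, npey = 4       ! virtual processor grid
       INTEGER :: pcoord_x, pcoord_y       ! coordinates of one PE in that grid
       INTEGER :: nnx, nny                 ! subdomain size along x and y
       INTEGER :: nxl, nxr, nys, nyn       ! subdomain index bounds of one PE

       nnx = ( nx + 1 ) / npex             ! assumes (nx+1) is divisible by npex
       nny = ( ny + 1 ) / npey             ! assumes (ny+1) is divisible by npey

       DO  pcoord_y = 0, npey-1
          DO  pcoord_x = 0, npex-1
             nxl = pcoord_x * nnx          ! left/right bounds along x
             nxr = nxl + nnx - 1
             nys = pcoord_y * nny          ! south/north bounds along y
             nyn = nys + nny - 1
             PRINT '(A,2I3,A,4I6)', 'PE at (', pcoord_x, pcoord_y, ' ) owns i,j = ', nxl, nxr, nys, nyn
          END DO
       END DO

    END PROGRAM decomposition_sketch

With nx = ny = 1023 and a 4 x 4 PE grid, each PE owns a 256 x 256 block of the horizontal grid; the ghost-layer exchange needed at the subdomain edges is shown further below.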
     
@@ lines 109-113 @@
          \node (center) at (0,1) {};
          \onslide<2-> \node (Network) at (-3.5,1) [draw, ellipse,fill=green!20] {Network};
-         \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-T3E)};
+         \node (dis_mem) at (-3.5,-1) [text width=0.28\textwidth] {\footnotesize \textbf{distributed} memory\\(Cray-XC30)};
          \onslide<3-> \node (add_mem) at (3.5,1) [rectangle, draw] {addressable memory};
          \node (sha_mem) at (3.5,-1) [text width=0.35\textwidth] {\footnotesize \textbf{shared} memory\\(SGI-Altix, multicore PCs)};
     
@@ lines 116-120 @@
          \onslide<6-> \node (clustered_systems) at (0,-3) [draw, text width=0.15\textwidth] {clustered systems};
          \node (cs_info) at (0,-4.2) [text width=0.4\textwidth] {\footnotesize (IBM-Regatta, Linux-Cluster,
-NEC-SX, SGI-ICE, Cray-XE6)};
+            NEC-SX, SGI-ICE, Cray-XC)};

 % Addressable memory node (big)
     
@@ lines 248-257 @@
    \vspace{2mm}
    \begin{itemize}
-      \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used in case of slow networks, but this generally doesn't scale for processor numbers $>$ 256.
+      \item<10-> Alternatively, a 1D-decomposition along $x$ or $y$ may be used.
       \vspace{2mm}
       \item<11-> Message passing is realized using MPI.
       \vspace{2mm}
       \item<12-> OpenMP parallelization as well as mixed usage of OpenMP and
-MPI is also possible. (OpenMP tests and optimization is under way)
+                   MPI is also possible.
    \end{itemize}
 \end{frame}
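
The bullets in the hunk above name the 2D virtual processor grid and MPI, but do not show how such a grid is created. The following is a minimal, hedged sketch of the standard MPI calls involved; it is not PALM's init_pegrid.f90, the communicator and variable names are made up, and cyclic lateral boundaries are assumed:

    ! Hedged sketch: creating a 2D virtual processor grid with standard MPI calls.
    PROGRAM cartesian_grid_sketch

       USE mpi
       IMPLICIT NONE

       INTEGER :: ierr, numprocs, myid
       INTEGER :: comm2d                       ! communicator with a 2D Cartesian topology
       INTEGER :: pdims(2)   = 0               ! 0 = let MPI_DIMS_CREATE choose npex, npey
       LOGICAL :: periods(2) = .TRUE.          ! cyclic boundary conditions assumed along x and y
       INTEGER :: pcoords(2)                   ! this PE's position in the virtual grid

       CALL MPI_INIT( ierr )
       CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )

    !
    !-- Factor the number of PEs into a 2D virtual processor grid and create
    !-- a communicator that carries this topology
       CALL MPI_DIMS_CREATE( numprocs, 2, pdims, ierr )
       CALL MPI_CART_CREATE( MPI_COMM_WORLD, 2, pdims, periods, .FALSE., comm2d, ierr )

       CALL MPI_COMM_RANK( comm2d, myid, ierr )
       CALL MPI_CART_COORDS( comm2d, myid, 2, pcoords, ierr )

       PRINT*, 'PE', myid, ' of', numprocs, ' has grid coordinates', pcoords

       CALL MPI_FINALIZE( ierr )

    END PROGRAM cartesian_grid_sketch

MPI_DIMS_CREATE factors the available PEs into a processor grid as evenly as possible; the slides further below note that the factorization can instead be prescribed by hand via the d3par parameters npex and npey.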
     
@@ lines 285-289 @@
          \onslide<5-> \includegraphics[width=0.8\textwidth]{parallelization_figures/fft.png} \end{center}
          \vspace{-4mm}
-         \textbf{Example: transpositions for solving the Poisson\\ \hspace{4em}equation}
+         \textbf{Example: transpositions for solving the Poisson\\ \hspace{4.1em}equation}
       \end{column}
    \end{columns}
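
The transpositions mentioned in the hunk above re-distribute the 3D data among the PEs so that the direction to be Fourier-transformed lies completely in one PE's memory. As a hedged illustration of the underlying communication pattern only (a toy 1D buffer and an MPI_ALLTOALL, not PALM's transpose routines):

    ! Hedged sketch: all-to-all redistribution, the communication pattern
    ! behind a transposition. Buffer layout and block size are arbitrary.
    PROGRAM transposition_sketch

       USE mpi
       IMPLICIT NONE

       INTEGER, PARAMETER :: nblock = 4        ! elements exchanged per PE pair (arbitrary)
       INTEGER :: ierr, numprocs, myid, i
       REAL, ALLOCATABLE :: sendbuf(:), recvbuf(:)

       CALL MPI_INIT( ierr )
       CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
       CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )

       ALLOCATE( sendbuf(nblock*numprocs), recvbuf(nblock*numprocs) )
       sendbuf = REAL( myid )                  ! every PE marks its data with its own rank

    !
    !-- Every PE sends one block to each PE and receives one block from each;
    !-- afterwards recvbuf holds one block from PE 0, one from PE 1, etc.
       CALL MPI_ALLTOALL( sendbuf, nblock, MPI_REAL, recvbuf, nblock, MPI_REAL,    &
                          MPI_COMM_WORLD, ierr )

       IF ( myid == 0 )  PRINT*, 'blocks received from PEs:',                      &
                                 ( recvbuf(1+(i-1)*nblock), i = 1, numprocs )

       CALL MPI_FINALIZE( ierr )

    END PROGRAM transposition_sketch

A 1D-decomposition needs only two such transpositions per Poisson-solver call, a 2D-decomposition needs more, which is the trade-off discussed on the following slide.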
     
@@ lines 302-310 @@
                  \item<3-> If a normal unix-kernel operating system (not a micro-kernel) is running on each CPU, then there might be a speed-up of the code, if 1-2 PEs fewer than the total number of PEs on the node are used.
                  \item<4-> On machines with a comparably slow network, a 1D-decomposition (along $x$) should be used, because then only two transpositions have to be carried out by the pressure solver. A 1D-decomposition is automatically used for NEC-machines (e.g. \texttt{-h necriam}). The virtual processor grid to be used can be set manually by the d3par-parameters \texttt{npex} and \texttt{npey}.
-            \item<6-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?).\\
+            \item<5-> Using the OpenMP parallelization does not yield any advantage over using a pure domain decomposition with MPI (contrary to expectations, it mostly slows down the computational speed), but this may change on cluster systems for very large numbers of processors ($>$10000?).\\
          \end{itemize}
          \begin{center}
          \vspace{-7mm}
-         \onslide<5-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png}
+         \onslide<4-> \includegraphics[width=0.13\textwidth]{parallelization_figures/folie_6.png}
          \end{center}
       \end{column}
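
The second item in the hunk above refers to the d3par parameters npex and npey. A hedged sketch of how the virtual processor grid might be prescribed in the run parameter file (the values are arbitrary examples, and all other d3par parameters of a real run are omitted):

     &d3par  npex = 8,  npey = 16,
     /

On 128 PEs this prescribes an 8 x 16 processor grid instead of the factorization chosen automatically; as the slide's wording "can be set manually" implies, both parameters can also be left unset, in which case the decomposition is determined automatically.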
     
@@ lines 325-333 @@
                  \quad \texttt{\%modules   ...:mpt:...}
             \vspace{2mm}
-                 \item<2-> The path to the MPI-library may have to be given in the compiler call, by setting an appropriate option in the configuration file .mrun.config:
-
-                 \quad \texttt{\%lopts  -axW:-cpp:-r8:-nbs:-Vaxlib:\textcolor{blue}{-L:<replace by mpi library path>:-lmpi}}
+                 \item<3-> The path to the MPI-library may have to be given in the compiler call, by setting an appropriate option in the configuration file .mrun.config:
+
+                 \quad \texttt{\%lopts  -r8:-nbs:\textcolor{blue}{-L:<replace by mpi library path>:-lmpi}}
             \vspace{2mm}
-                 \item<3-> All MPI calls must be within\\
+                 \item<4-> All MPI calls must be within\\
                  \quad \texttt{CALL MPI\_INIT( ierror )}\\
                  \quad $\vdots$\\
     
@@ lines 417-421 @@
             \quad \texttt{CALL MPI\underline{\ }TYPE\underline{\ }COMMIT( type\underline{\ }yz(0), ierr )   ! see file init\underline{\ }pegrid.f90}\\
             \ \\
-            \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-ngl,nxl), type\underline{\ }yz(grid\underline{\ }level), MPI\underline{\ }REAL, pleft, 0, ...}\\
+            \quad \texttt{CALL MPI\underline{\ }SENDRECV( ar(nzb,nys-ngl,nxl), 1, type\underline{\ }yz(grid\underline{\ }level), pleft, 0, ...}\\
          \end{itemize}
 \end{frame}
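
The hunk above corrects the argument order of the MPI_SENDRECV call shown on the slide: in the MPI standard the send arguments are buffer, count, datatype, destination, tag, so one instance of the committed derived type type_yz is sent, rather than passing the type in the count position. As a hedged, self-contained sketch of the same pattern, i.e. a committed derived datatype used for a ghost-layer exchange between neighbouring PEs (toy 2D array and made-up names, not PALM's exchange routine):

    ! Hedged sketch: ghost-layer exchange with a derived datatype and
    ! MPI_SENDRECV. Array shape, names, and the 1D neighbour layout are
    ! illustrative assumptions only.
    PROGRAM ghost_exchange_sketch

       USE mpi
       IMPLICIT NONE

       INTEGER, PARAMETER :: nx = 8, ny = 8          ! interior size of the local subdomain
       REAL    :: f(0:ny+1,0:nx+1)                   ! one ghost layer on every side
       INTEGER :: ierr, myid, numprocs
       INTEGER :: pnorth, psouth                     ! neighbour ranks (decomposition along y)
       INTEGER :: type_x_row                         ! derived type: one grid row, strided in memory
       INTEGER :: status(MPI_STATUS_SIZE)

       CALL MPI_INIT( ierr )
       CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
       CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )

       psouth = myid - 1;  IF ( myid == 0 )           psouth = MPI_PROC_NULL
       pnorth = myid + 1;  IF ( myid == numprocs-1 )  pnorth = MPI_PROC_NULL

       f = REAL( myid )                              ! mark all points with the own rank

    !
    !-- A row f(j,1:nx) is not contiguous in memory (stride ny+2), hence the
    !-- derived datatype, committed once and reused in every exchange
       CALL MPI_TYPE_VECTOR( nx, 1, ny+2, MPI_REAL, type_x_row, ierr )
       CALL MPI_TYPE_COMMIT( type_x_row, ierr )

    !
    !-- Send the northernmost interior row to pnorth while receiving the
    !-- southern ghost row from psouth, and vice versa (MPI_PROC_NULL turns
    !-- the calls into no-ops at the domain boundaries)
       CALL MPI_SENDRECV( f(ny,1), 1, type_x_row, pnorth, 0,                       &
                          f(0,1),  1, type_x_row, psouth, 0,                       &
                          MPI_COMM_WORLD, status, ierr )
       CALL MPI_SENDRECV( f(1,1),    1, type_x_row, psouth, 1,                     &
                          f(ny+1,1), 1, type_x_row, pnorth, 1,                     &
                          MPI_COMM_WORLD, status, ierr )

       IF ( myid == 1 )  PRINT*, 'PE 1: southern ghost row now holds', f(0,1)   ! 0.0, the rank of its neighbour

       CALL MPI_FINALIZE( ierr )

    END PROGRAM ghost_exchange_sketch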
     
@@ lines 468-472 @@
          \onslide<4-> \textbf{General comment:}
          \begin{itemize}
-            \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-options)
+            \item Parallel I/O on a large number of files ($>$1000) currently may cause severe file system problems (e.g. on Lustre file systems).\\ \textbf{Workaround:} reduce the maximum number of parallel I/O streams\\ \hspace{5.75em}(see \texttt{mrun}-option \texttt{-w})
          \end{itemize}
       \end{column}
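
For context on why so many files appear: in this I/O scheme every PE writes the data of its own subdomain to its own file, so the file count scales with the number of PEs. A hedged sketch of that pattern (generic Fortran; the file name prefix and unit number are made up, not PALM's actual output files):

    ! Hedged sketch: one output file per PE, the pattern that leads to the
    ! large file counts mentioned above.
    PROGRAM per_pe_output_sketch

       USE mpi
       IMPLICIT NONE

       INTEGER :: ierr, myid
       REAL    :: local_data(10)
       CHARACTER(LEN=32) :: filename

       CALL MPI_INIT( ierr )
       CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )

       local_data = REAL( myid )

    !
    !-- Every PE writes its own subdomain data to its own file
       WRITE( filename, '(A,I6.6)' )  'SUBDOMAIN_', myid
       OPEN( 20, FILE=TRIM( filename ), FORM='UNFORMATTED' )
       WRITE( 20 )  local_data
       CLOSE( 20 )

       CALL MPI_FINALIZE( ierr )

    END PROGRAM per_pe_output_sketch

With several thousand PEs this alone produces several thousand files per output, which is the situation the Lustre remark in the hunk above refers to.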
     
@@ lines 476-480 @@
 
 
-% Folie 13
+%Folie 13
 \begin{frame}
    \frametitle{PALM Parallel I/O for 2D/3D Data}
     
@@ lines 492-496 @@
 \end{frame}
 
-% Folie 14
+%Folie 14
 \begin{frame}
    \frametitle{Performance Examples (I)}
     
@@ lines 520-524 @@
 \end{frame}
 
-% Folie 15
+%Folie 15
 \begin{frame}
    \frametitle{Performance Examples (II)}
     
@@ lines 542-566 @@
 \end{frame}
 
+%Folie 16
+\begin{frame}
+   \frametitle{Performance Examples (III)}
+   \begin{itemize}
+      \item Simulation with $2160^3$ grid points  ($\sim$ 2 TByte memory)
+   \end{itemize}
+      \begin{columns}[T]
+         \begin{column}{0.5\textwidth}
+            \includegraphics[scale=0.3]{parallelization_figures/perf_4.png} \\
+            \scriptsize
+            \quad Cray-XC30, HLRN-III, Hannover\\
+            \quad (2D-domain decomposition)
+         \end{column}
+         \begin{column}{0.5\textwidth}
+            \vspace{35mm}
+            \onslide<2-> currently largest simulation feasible on that system:\\
+            \ \\
+            $5600^3$ grid points
+         \end{column}
+      \end{columns}
+\end{frame}
+
 \end{document}