[1682] | 1 | !> @file transpose.f90 |
---|
[2000] | 2 | !------------------------------------------------------------------------------! |
---|
[2696] | 3 | ! This file is part of the PALM model system. |
---|
[1036] | 4 | ! |
---|
[2000] | 5 | ! PALM is free software: you can redistribute it and/or modify it under the |
---|
| 6 | ! terms of the GNU General Public License as published by the Free Software |
---|
| 7 | ! Foundation, either version 3 of the License, or (at your option) any later |
---|
| 8 | ! version. |
---|
[1036] | 9 | ! |
---|
| 10 | ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY |
---|
| 11 | ! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR |
---|
| 12 | ! A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
---|
| 13 | ! |
---|
| 14 | ! You should have received a copy of the GNU General Public License along with |
---|
| 15 | ! PALM. If not, see <http://www.gnu.org/licenses/>. |
---|
| 16 | ! |
---|
[2718] | 17 | ! Copyright 1997-2018 Leibniz Universitaet Hannover |
---|
[2000] | 18 | !------------------------------------------------------------------------------! |
---|
[1036] | 19 | ! |
---|
[484] | 20 | ! Current revisions: |
---|
[1] | 21 | ! ----------------- |
---|
[1321] | 22 | ! |
---|
[2119] | 23 | ! |
---|
[1321] | 24 | ! Former revisions: |
---|
| 25 | ! ----------------- |
---|
| 26 | ! $Id: transpose.f90 3634 2018-12-18 12:31:28Z kanani $ |
---|
[3634] | 27 | ! OpenACC port for SPEC |
---|
| 28 | ! |
---|
| 29 | ! 3241 2018-09-12 15:02:00Z raasch |
---|
[3241] | 30 | ! unused variables removed |
---|
| 31 | ! |
---|
| 32 | ! 2718 2018-01-02 08:49:38Z maronga |
---|
[2716] | 33 | ! Corrected "Former revisions" section |
---|
| 34 | ! |
---|
| 35 | ! 2696 2017-12-14 17:12:51Z kanani |
---|
| 36 | ! Change in file header (GPL part) |
---|
[1321] | 37 | ! |
---|
[2716] | 38 | ! 2119 2017-01-17 16:51:50Z raasch |
---|
| 39 | ! |
---|
[2119] | 40 | ! 2118 2017-01-17 16:38:49Z raasch |
---|
| 41 | ! OpenACC directives removed |
---|
| 42 | ! |
---|
[2001] | 43 | ! 2000 2016-08-20 18:09:15Z knoop |
---|
| 44 | ! Forced header and separation lines into 80 columns |
---|
| 45 | ! |
---|
[1683] | 46 | ! 1682 2015-10-07 23:56:08Z knoop |
---|
| 47 | ! Code annotations made doxygen readable |
---|
| 48 | ! |
---|
[1325] | 49 | ! 1324 2014-03-21 09:13:16Z suehring |
---|
| 50 | ! Bugfix: ONLY statement for module pegrid removed |
---|
| 51 | ! |
---|
[1321] | 52 | ! 1320 2014-03-20 08:40:49Z raasch |
---|
[1320] | 53 | ! ONLY-attribute added to USE-statements, |
---|
| 54 | ! kind-parameters added to all INTEGER and REAL declaration statements, |
---|
| 55 | ! kinds are defined in new module kinds, |
---|
| 56 | ! old module precision_kind is removed, |
---|
| 57 | ! revision history before 2012 removed, |
---|
| 58 | ! comment fields (!:) to be used for variable explanations added to |
---|
| 59 | ! all variable declaration statements |
---|
[198] | 60 | ! |
---|
[1319] | 61 | ! 1318 2014-03-17 13:35:16Z raasch |
---|
| 62 | ! cpu_log_nowait parameter added to cpu measurements of the transpositions |
---|
| 63 | ! required for solving the Poisson equation (poisfft), |
---|
| 64 | ! module interfaces removed |
---|
| 65 | ! |
---|
[1258] | 66 | ! 1257 2013-11-08 15:18:40Z raasch |
---|
| 67 | ! openacc loop and loop vector clauses removed |
---|
| 68 | ! |
---|
[1217] | 69 | ! 1216 2013-08-26 09:31:42Z raasch |
---|
| 70 | ! re-sorting of the transposed / to be transposed arrays moved to separate |
---|
| 71 | ! routines resort_for_... |
---|
| 72 | ! |
---|
[1112] | 73 | ! 1111 2013-03-08 23:54:10Z raasch |
---|
| 74 | ! openACC directives added, |
---|
| 75 | ! resorting data from/to work changed, work got 4 dimensions instead of 1 |
---|
| 76 | ! |
---|
[1107] | 77 | ! 1106 2013-03-04 05:31:38Z raasch |
---|
| 78 | ! preprocessor lines rearranged so that routines can also be used in serial |
---|
| 79 | ! (non-parallel) mode |
---|
| 80 | ! |
---|
[1093] | 81 | ! 1092 2013-02-02 11:24:22Z raasch |
---|
| 82 | ! unused variables removed |
---|
| 83 | ! |
---|
[1037] | 84 | ! 1036 2012-10-22 13:43:42Z raasch |
---|
| 85 | ! code put under GPL (PALM 3.9) |
---|
| 86 | ! |
---|
[1004] | 87 | ! 1003 2012-09-14 14:35:53Z raasch |
---|
| 88 | ! indices nxa, nya, etc. replaced by nx, ny, etc. |
---|
| 89 | ! |
---|
[1] | 90 | ! Revision 1.1 1997/07/24 11:25:18 raasch |
---|
| 91 | ! Initial revision |
---|
| 92 | ! |
---|
[1216] | 93 | !------------------------------------------------------------------------------! |
---|
| 94 | ! Description: |
---|
| 95 | ! ------------ |
---|
[1682] | 96 | !> Resorting data for the transposition from x to y. The transposition itself |
---|
| 97 | !> is carried out in transpose_xy |
---|
[1216] | 98 | !------------------------------------------------------------------------------! |
---|
[1682] | 99 | SUBROUTINE resort_for_xy( f_in, f_inv ) |
---|
| 100 | |
---|
[1216] | 101 | |
---|
[1320] | 102 | USE indices, & |
---|
| 103 | ONLY: nx |
---|
[1216] | 104 | |
---|
[1320] | 105 | USE kinds |
---|
| 106 | |
---|
| 107 | USE transpose_indices, & |
---|
[3241] | 108 | ONLY: nyn_x, nys_x, nzb_x, nzt_x |
---|
[1320] | 109 | |
---|
[1216] | 110 | IMPLICIT NONE |
---|
| 111 | |
---|
[1682] | 112 | REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 113 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
[1216] | 114 | |
---|
| 115 | |
---|
[1682] | 116 | INTEGER(iwp) :: i !< |
---|
| 117 | INTEGER(iwp) :: j !< |
---|
| 118 | INTEGER(iwp) :: k !< |
---|
[1] | 119 | ! |
---|
[1216] | 120 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 121 | !-- by MPI contiguous |
---|
| 122 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 123 | !$OMP DO |
---|
[3634] | 124 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 125 | !$ACC PRESENT(f_inv, f_in) |
---|
[1216] | 126 | DO i = 0, nx |
---|
| 127 | DO k = nzb_x, nzt_x |
---|
| 128 | DO j = nys_x, nyn_x |
---|
| 129 | f_inv(j,k,i) = f_in(i,j,k) |
---|
| 130 | ENDDO |
---|
| 131 | ENDDO |
---|
| 132 | ENDDO |
---|
| 133 | !$OMP END PARALLEL |
---|
| 134 | |
---|
| 135 | END SUBROUTINE resort_for_xy |
---|
| 136 | |
---|
| 137 | |
---|
| 138 | !------------------------------------------------------------------------------! |
---|
[1] | 139 | ! Description: |
---|
| 140 | ! ------------ |
---|
[1682] | 141 | !> Transposition of input array (f_in) from x to y. For the input array, all |
---|
| 142 | !> elements along x reside on the same PE, while after transposition, all |
---|
| 143 | !> elements along y reside on the same PE. |
---|
[1] | 144 | !------------------------------------------------------------------------------! |
---|
[1682] | 145 | SUBROUTINE transpose_xy( f_inv, f_out ) |
---|
[1] | 146 | |
---|
[1682] | 147 | |
---|
[1320] | 148 | USE cpulog, & |
---|
| 149 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
| 150 | |
---|
| 151 | USE indices, & |
---|
| 152 | ONLY: nx, ny |
---|
| 153 | |
---|
| 154 | USE kinds |
---|
| 155 | |
---|
[1] | 156 | USE pegrid |
---|
| 157 | |
---|
[1320] | 158 | USE transpose_indices, & |
---|
| 159 | ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y |
---|
| 160 | |
---|
[1] | 161 | IMPLICIT NONE |
---|
| 162 | |
---|
[1682] | 163 | INTEGER(iwp) :: i !< |
---|
| 164 | INTEGER(iwp) :: j !< |
---|
| 165 | INTEGER(iwp) :: k !< |
---|
| 166 | INTEGER(iwp) :: l !< |
---|
| 167 | INTEGER(iwp) :: ys !< |
---|
[1320] | 168 | |
---|
[1682] | 169 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
| 170 | REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
[1] | 171 | |
---|
[1682] | 172 | REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< |
---|
[3634] | 173 | !$ACC DECLARE CREATE(work) |
---|
[1111] | 174 | |
---|
| 175 | |
---|
[1106] | 176 | IF ( numprocs /= 1 ) THEN |
---|
| 177 | |
---|
| 178 | #if defined( __parallel ) |
---|
[1] | 179 | ! |
---|
[1106] | 180 | !-- Transpose array |
---|
[1318] | 181 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 182 | !$ACC UPDATE HOST(f_inv) |
---|
[1106] | 183 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 184 | CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & |
---|
| 185 | work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & |
---|
[1106] | 186 | comm1dy, ierr ) |
---|
[3634] | 187 | !$ACC UPDATE DEVICE(work) |
---|
[1106] | 188 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 189 | |
---|
| 190 | ! |
---|
[1106] | 191 | !-- Reorder transposed array |
---|
[1111] | 192 | !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) |
---|
[683] | 193 | !$OMP DO |
---|
[1106] | 194 | DO l = 0, pdims(2) - 1 |
---|
| 195 | ys = 0 + l * ( nyn_x - nys_x + 1 ) |
---|
[3634] | 196 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 197 | !$ACC PRESENT(f_out, work) |
---|
[1106] | 198 | DO i = nxl_y, nxr_y |
---|
| 199 | DO k = nzb_y, nzt_y |
---|
| 200 | DO j = ys, ys + nyn_x - nys_x |
---|
[1111] | 201 | f_out(j,i,k) = work(j-ys+1,k,i,l) |
---|
[1106] | 202 | ENDDO |
---|
[1] | 203 | ENDDO |
---|
| 204 | ENDDO |
---|
| 205 | ENDDO |
---|
[683] | 206 | !$OMP END PARALLEL |
---|
[1] | 207 | #endif |
---|
| 208 | |
---|
[1106] | 209 | ELSE |
---|
| 210 | |
---|
| 211 | ! |
---|
| 212 | !-- Reorder transposed array |
---|
| 213 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 214 | !$OMP DO |
---|
[3634] | 215 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 216 | !$ACC PRESENT(f_out, f_inv) |
---|
[1106] | 217 | DO k = nzb_y, nzt_y |
---|
| 218 | DO i = nxl_y, nxr_y |
---|
| 219 | DO j = 0, ny |
---|
| 220 | f_out(j,i,k) = f_inv(j,k,i) |
---|
| 221 | ENDDO |
---|
| 222 | ENDDO |
---|
| 223 | ENDDO |
---|
| 224 | !$OMP END PARALLEL |
---|
| 225 | |
---|
| 226 | ENDIF |
---|
| 227 | |
---|
[1] | 228 | END SUBROUTINE transpose_xy |
---|
| 229 | |
---|
| 230 | |
---|
| 231 | !------------------------------------------------------------------------------! |
---|
| 232 | ! Description: |
---|
| 233 | ! ------------ |
---|
[1682] | 234 | !> Resorting data after the transposition from x to z. The transposition itself |
---|
| 235 | !> is carried out in transpose_xz |
---|
[1216] | 236 | !------------------------------------------------------------------------------! |
---|
[1682] | 237 | SUBROUTINE resort_for_xz( f_inv, f_out ) |
---|
[1216] | 238 | |
---|
[1682] | 239 | |
---|
[1320] | 240 | USE indices, & |
---|
| 241 | ONLY: nxl, nxr, nyn, nys, nz |
---|
[1216] | 242 | |
---|
[1320] | 243 | USE kinds |
---|
| 244 | |
---|
[1216] | 245 | IMPLICIT NONE |
---|
| 246 | |
---|
[1682] | 247 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 248 | REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !< |
---|
[1216] | 249 | |
---|
[1682] | 250 | INTEGER(iwp) :: i !< |
---|
| 251 | INTEGER(iwp) :: j !< |
---|
| 252 | INTEGER(iwp) :: k !< |
---|
[1216] | 253 | ! |
---|
| 254 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 255 | !-- by MPI contiguous. |
---|
| 256 | !-- In case of parallel fft/transposition, scattered store is faster in |
---|
| 257 | !-- backward direction!!! |
---|
| 258 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 259 | !$OMP DO |
---|
[3634] | 260 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 261 | !$ACC PRESENT(f_out, f_inv) |
---|
[1216] | 262 | DO k = 1, nz |
---|
| 263 | DO i = nxl, nxr |
---|
| 264 | DO j = nys, nyn |
---|
| 265 | f_out(k,j,i) = f_inv(j,i,k) |
---|
| 266 | ENDDO |
---|
| 267 | ENDDO |
---|
| 268 | ENDDO |
---|
| 269 | !$OMP END PARALLEL |
---|
| 270 | |
---|
| 271 | END SUBROUTINE resort_for_xz |
---|
| 272 | |
---|
| 273 | |
---|
| 274 | !------------------------------------------------------------------------------! |
---|
| 275 | ! Description: |
---|
| 276 | ! ------------ |
---|
[1682] | 277 | !> Transposition of input array (f_in) from x to z. For the input array, all |
---|
| 278 | !> elements along x reside on the same PE, while after transposition, all |
---|
| 279 | !> elements along z reside on the same PE. |
---|
[1] | 280 | !------------------------------------------------------------------------------! |
---|
[1682] | 281 | SUBROUTINE transpose_xz( f_in, f_inv ) |
---|
[1] | 282 | |
---|
[1682] | 283 | |
---|
[1320] | 284 | USE cpulog, & |
---|
| 285 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
[1] | 286 | |
---|
[1320] | 287 | USE indices, & |
---|
[3241] | 288 | ONLY: nnx, nx, nxl, nxr, nyn, nys, nz |
---|
[1320] | 289 | |
---|
| 290 | USE kinds |
---|
| 291 | |
---|
[1324] | 292 | USE pegrid |
---|
[1320] | 293 | |
---|
| 294 | USE transpose_indices, & |
---|
| 295 | ONLY: nyn_x, nys_x, nzb_x, nzt_x |
---|
| 296 | |
---|
[1] | 297 | IMPLICIT NONE |
---|
| 298 | |
---|
[1682] | 299 | INTEGER(iwp) :: i !< |
---|
| 300 | INTEGER(iwp) :: j !< |
---|
| 301 | INTEGER(iwp) :: k !< |
---|
| 302 | INTEGER(iwp) :: l !< |
---|
| 303 | INTEGER(iwp) :: xs !< |
---|
[1] | 304 | |
---|
[1682] | 305 | REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 306 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
[1] | 307 | |
---|
[1682] | 308 | REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< |
---|
[3634] | 309 | !$ACC DECLARE CREATE(work) |
---|
[1111] | 310 | |
---|
[1320] | 311 | |
---|
[1] | 312 | ! |
---|
| 313 | !-- If the PE grid is one-dimensional along y, the array has only to be |
---|
| 314 | !-- reordered locally and therefore no transposition has to be done. |
---|
| 315 | IF ( pdims(1) /= 1 ) THEN |
---|
[1106] | 316 | |
---|
| 317 | #if defined( __parallel ) |
---|
[1] | 318 | ! |
---|
| 319 | !-- Reorder input array for transposition |
---|
[1111] | 320 | !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) |
---|
[683] | 321 | !$OMP DO |
---|
[1] | 322 | DO l = 0, pdims(1) - 1 |
---|
| 323 | xs = 0 + l * nnx |
---|
[3634] | 324 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 325 | !$ACC PRESENT(work, f_in) |
---|
[1003] | 326 | DO k = nzb_x, nzt_x |
---|
[164] | 327 | DO i = xs, xs + nnx - 1 |
---|
[1003] | 328 | DO j = nys_x, nyn_x |
---|
[1111] | 329 | work(j,i-xs+1,k,l) = f_in(i,j,k) |
---|
[1] | 330 | ENDDO |
---|
| 331 | ENDDO |
---|
| 332 | ENDDO |
---|
| 333 | ENDDO |
---|
[683] | 334 | !$OMP END PARALLEL |
---|
[1] | 335 | |
---|
| 336 | ! |
---|
| 337 | !-- Transpose array |
---|
[1318] | 338 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 339 | !$ACC UPDATE HOST(work) |
---|
[622] | 340 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 341 | CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & |
---|
| 342 | f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & |
---|
[1] | 343 | comm1dx, ierr ) |
---|
[3634] | 344 | !$ACC UPDATE DEVICE(f_inv) |
---|
[1] | 345 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1106] | 346 | #endif |
---|
| 347 | |
---|
[1] | 348 | ELSE |
---|
[1106] | 349 | |
---|
[1] | 350 | ! |
---|
| 351 | !-- Reorder the array in a way that the z index is in first position |
---|
[683] | 352 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 353 | !$OMP DO |
---|
[3634] | 354 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 355 | !$ACC PRESENT(f_inv, f_in) |
---|
[1003] | 356 | DO i = nxl, nxr |
---|
| 357 | DO j = nys, nyn |
---|
| 358 | DO k = 1, nz |
---|
[164] | 359 | f_inv(j,i,k) = f_in(i,j,k) |
---|
[1] | 360 | ENDDO |
---|
| 361 | ENDDO |
---|
| 362 | ENDDO |
---|
[683] | 363 | !$OMP END PARALLEL |
---|
[1] | 364 | |
---|
[164] | 365 | ENDIF |
---|
| 366 | |
---|
[1] | 367 | END SUBROUTINE transpose_xz |
---|
| 368 | |
---|
| 369 | |
---|
| 370 | !------------------------------------------------------------------------------! |
---|
| 371 | ! Description: |
---|
| 372 | ! ------------ |
---|
[1682] | 373 | !> Resorting data after the transposition from y to x. The transposition itself |
---|
| 374 | !> is carried out in transpose_yx |
---|
[1216] | 375 | !------------------------------------------------------------------------------! |
---|
[1682] | 376 | SUBROUTINE resort_for_yx( f_inv, f_out ) |
---|
[1216] | 377 | |
---|
[1682] | 378 | |
---|
[1320] | 379 | USE indices, & |
---|
| 380 | ONLY: nx |
---|
[1216] | 381 | |
---|
[1320] | 382 | USE kinds |
---|
| 383 | |
---|
| 384 | USE transpose_indices, & |
---|
| 385 | ONLY: nyn_x, nys_x, nzb_x, nzt_x |
---|
| 386 | |
---|
[1216] | 387 | IMPLICIT NONE |
---|
| 388 | |
---|
[1682] | 389 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
| 390 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
[1216] | 391 | |
---|
| 392 | |
---|
[1682] | 393 | INTEGER(iwp) :: i !< |
---|
| 394 | INTEGER(iwp) :: j !< |
---|
| 395 | INTEGER(iwp) :: k !< |
---|
[1216] | 396 | ! |
---|
| 397 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 398 | !-- by MPI contiguous |
---|
| 399 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 400 | !$OMP DO |
---|
[3634] | 401 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 402 | !$ACC PRESENT(f_out, f_inv) |
---|
[1216] | 403 | DO i = 0, nx |
---|
| 404 | DO k = nzb_x, nzt_x |
---|
| 405 | DO j = nys_x, nyn_x |
---|
| 406 | f_out(i,j,k) = f_inv(j,k,i) |
---|
| 407 | ENDDO |
---|
| 408 | ENDDO |
---|
| 409 | ENDDO |
---|
| 410 | !$OMP END PARALLEL |
---|
| 411 | |
---|
| 412 | END SUBROUTINE resort_for_yx |
---|
| 413 | |
---|
| 414 | |
---|
| 415 | !------------------------------------------------------------------------------! |
---|
| 416 | ! Description: |
---|
| 417 | ! ------------ |
---|
[1682] | 418 | !> Transposition of input array (f_in) from y to x. For the input array, all |
---|
| 419 | !> elements along y reside on the same PE, while after transposition, all |
---|
| 420 | !> elements along x reside on the same PE. |
---|
[1] | 421 | !------------------------------------------------------------------------------! |
---|
[1682] | 422 | SUBROUTINE transpose_yx( f_in, f_inv ) |
---|
[1] | 423 | |
---|
[1682] | 424 | |
---|
[1320] | 425 | USE cpulog, & |
---|
| 426 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
[1] | 427 | |
---|
[1320] | 428 | USE indices, & |
---|
| 429 | ONLY: nx, ny |
---|
| 430 | |
---|
| 431 | USE kinds |
---|
| 432 | |
---|
[1324] | 433 | USE pegrid |
---|
[1320] | 434 | |
---|
| 435 | USE transpose_indices, & |
---|
| 436 | ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y |
---|
| 437 | |
---|
[1] | 438 | IMPLICIT NONE |
---|
| 439 | |
---|
[1682] | 440 | INTEGER(iwp) :: i !< |
---|
| 441 | INTEGER(iwp) :: j !< |
---|
| 442 | INTEGER(iwp) :: k !< |
---|
| 443 | INTEGER(iwp) :: l !< |
---|
| 444 | INTEGER(iwp) :: ys !< |
---|
[1] | 445 | |
---|
[1682] | 446 | REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
| 447 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
[1111] | 448 | |
---|
[1682] | 449 | REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< |
---|
[3634] | 450 | !$ACC DECLARE CREATE(work) |
---|
[1111] | 451 | |
---|
[1320] | 452 | |
---|
[1106] | 453 | IF ( numprocs /= 1 ) THEN |
---|
| 454 | |
---|
[1] | 455 | #if defined( __parallel ) |
---|
| 456 | ! |
---|
[1106] | 457 | !-- Reorder input array for transposition |
---|
[1111] | 458 | !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) |
---|
[683] | 459 | !$OMP DO |
---|
[1106] | 460 | DO l = 0, pdims(2) - 1 |
---|
| 461 | ys = 0 + l * ( nyn_x - nys_x + 1 ) |
---|
[3634] | 462 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 463 | !$ACC PRESENT(work, f_in) |
---|
[1106] | 464 | DO i = nxl_y, nxr_y |
---|
| 465 | DO k = nzb_y, nzt_y |
---|
| 466 | DO j = ys, ys + nyn_x - nys_x |
---|
[1111] | 467 | work(j-ys+1,k,i,l) = f_in(j,i,k) |
---|
[1106] | 468 | ENDDO |
---|
| 469 | ENDDO |
---|
| 470 | ENDDO |
---|
| 471 | ENDDO |
---|
| 472 | !$OMP END PARALLEL |
---|
| 473 | |
---|
| 474 | ! |
---|
| 475 | !-- Transpose array |
---|
[1318] | 476 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 477 | !$ACC UPDATE HOST(work) |
---|
[1106] | 478 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 479 | CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & |
---|
| 480 | f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & |
---|
[1106] | 481 | comm1dy, ierr ) |
---|
[3634] | 482 | !$ACC UPDATE DEVICE(f_inv) |
---|
[1106] | 483 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 484 | #endif |
---|
| 485 | |
---|
| 486 | ELSE |
---|
| 487 | |
---|
| 488 | ! |
---|
| 489 | !-- Reorder array f_in the same way as ALLTOALL did it |
---|
| 490 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 491 | !$OMP DO |
---|
[3634] | 492 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 493 | !$ACC PRESENT(f_inv, f_in) |
---|
[1003] | 494 | DO i = nxl_y, nxr_y |
---|
| 495 | DO k = nzb_y, nzt_y |
---|
[1106] | 496 | DO j = 0, ny |
---|
| 497 | f_inv(j,k,i) = f_in(j,i,k) |
---|
[1] | 498 | ENDDO |
---|
| 499 | ENDDO |
---|
| 500 | ENDDO |
---|
[683] | 501 | !$OMP END PARALLEL |
---|
[1] | 502 | |
---|
[1106] | 503 | ENDIF |
---|
[1] | 504 | |
---|
| 505 | END SUBROUTINE transpose_yx |
---|
| 506 | |
---|
| 507 | |
---|
| 508 | !------------------------------------------------------------------------------! |
---|
| 509 | ! Description: |
---|
| 510 | ! ------------ |
---|
[1682] | 511 | !> Transposition of input array (f_in) from y to x. For the input array, all |
---|
| 512 | !> elements along y reside on the same PE, while after transposition, all |
---|
| 513 | !> elements along x reside on the same PE. |
---|
| 514 | !> This is a direct transposition for arrays with indices in regular order |
---|
| 515 | !> (k,j,i) (cf. transpose_yx). |
---|
[1] | 516 | !------------------------------------------------------------------------------! |
---|
[1682] | 517 | SUBROUTINE transpose_yxd( f_in, f_out ) |
---|
[1] | 518 | |
---|
[1682] | 519 | |
---|
[1320] | 520 | USE cpulog, & |
---|
[3241] | 521 | ONLY: cpu_log, log_point_s |
---|
[1] | 522 | |
---|
[1320] | 523 | USE indices, & |
---|
| 524 | ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz |
---|
| 525 | |
---|
| 526 | USE kinds |
---|
| 527 | |
---|
[1324] | 528 | USE pegrid |
---|
[1320] | 529 | |
---|
| 530 | USE transpose_indices, & |
---|
| 531 | ONLY: nyn_x, nys_x, nzb_x, nzt_x |
---|
| 532 | |
---|
[1] | 533 | IMPLICIT NONE |
---|
| 534 | |
---|
[1682] | 535 | INTEGER(iwp) :: i !< |
---|
| 536 | INTEGER(iwp) :: j !< |
---|
| 537 | INTEGER(iwp) :: k !< |
---|
| 538 | INTEGER(iwp) :: l !< |
---|
| 539 | INTEGER(iwp) :: m !< |
---|
| 540 | INTEGER(iwp) :: xs !< |
---|
[1] | 541 | |
---|
[1682] | 542 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 543 | REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !< |
---|
| 544 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 545 | REAL(wp) :: work(nnx*nny*nnz) !< |
---|
[1] | 546 | #if defined( __parallel ) |
---|
| 547 | |
---|
| 548 | ! |
---|
| 549 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 550 | !-- by MPI contiguous |
---|
[1003] | 551 | DO k = 1, nz |
---|
| 552 | DO j = nys, nyn |
---|
| 553 | DO i = nxl, nxr |
---|
[164] | 554 | f_inv(i,k,j) = f_in(k,j,i) |
---|
[1] | 555 | ENDDO |
---|
| 556 | ENDDO |
---|
| 557 | ENDDO |
---|
| 558 | |
---|
| 559 | ! |
---|
| 560 | !-- Transpose array |
---|
| 561 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) |
---|
[622] | 562 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1] | 563 | CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, & |
---|
[164] | 564 | work(1), sendrecvcount_xy, MPI_REAL, & |
---|
[1] | 565 | comm1dx, ierr ) |
---|
| 566 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 567 | |
---|
| 568 | ! |
---|
| 569 | !-- Reorder transposed array |
---|
| 570 | m = 0 |
---|
| 571 | DO l = 0, pdims(1) - 1 |
---|
| 572 | xs = 0 + l * nnx |
---|
[1003] | 573 | DO j = nys_x, nyn_x |
---|
| 574 | DO k = 1, nz |
---|
[1] | 575 | DO i = xs, xs + nnx - 1 |
---|
| 576 | m = m + 1 |
---|
[164] | 577 | f_out(i,j,k) = work(m) |
---|
[1] | 578 | ENDDO |
---|
| 579 | ENDDO |
---|
| 580 | ENDDO |
---|
| 581 | ENDDO |
---|
| 582 | |
---|
| 583 | #endif |
---|
| 584 | |
---|
| 585 | END SUBROUTINE transpose_yxd |
---|
| 586 | |
---|
| 587 | |
---|
| 588 | !------------------------------------------------------------------------------! |
---|
| 589 | ! Description: |
---|
| 590 | ! ------------ |
---|
[1682] | 591 | !> Resorting data for the transposition from y to z. The transposition itself |
---|
| 592 | !> is carried out in transpose_yz |
---|
[1216] | 593 | !------------------------------------------------------------------------------! |
---|
[1682] | 594 | SUBROUTINE resort_for_yz( f_in, f_inv ) |
---|
[1216] | 595 | |
---|
[1682] | 596 | |
---|
[1320] | 597 | USE indices, & |
---|
| 598 | ONLY: ny |
---|
[1216] | 599 | |
---|
[1320] | 600 | USE kinds |
---|
| 601 | |
---|
| 602 | USE transpose_indices, & |
---|
| 603 | ONLY: nxl_y, nxr_y, nzb_y, nzt_y |
---|
| 604 | |
---|
[1216] | 605 | IMPLICIT NONE |
---|
| 606 | |
---|
[1682] | 607 | REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
| 608 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
[1216] | 609 | |
---|
[1682] | 610 | INTEGER(iwp) :: i !< |
---|
| 611 | INTEGER(iwp) :: j !< |
---|
| 612 | INTEGER(iwp) :: k !< |
---|
[1216] | 613 | |
---|
| 614 | ! |
---|
| 615 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 616 | !-- by MPI contiguous |
---|
| 617 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 618 | !$OMP DO |
---|
[3634] | 619 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 620 | !$ACC PRESENT(f_inv, f_in) |
---|
[1216] | 621 | DO j = 0, ny |
---|
| 622 | DO k = nzb_y, nzt_y |
---|
| 623 | DO i = nxl_y, nxr_y |
---|
| 624 | f_inv(i,k,j) = f_in(j,i,k) |
---|
| 625 | ENDDO |
---|
| 626 | ENDDO |
---|
| 627 | ENDDO |
---|
| 628 | !$OMP END PARALLEL |
---|
| 629 | |
---|
| 630 | END SUBROUTINE resort_for_yz |
---|
| 631 | |
---|
| 632 | |
---|
| 633 | !------------------------------------------------------------------------------! |
---|
| 634 | ! Description: |
---|
| 635 | ! ------------ |
---|
[1682] | 636 | !> Transposition of input array (f_in) from y to z. For the input array, all |
---|
| 637 | !> elements along y reside on the same PE, while after transposition, all |
---|
| 638 | !> elements along z reside on the same PE. |
---|
[1] | 639 | !------------------------------------------------------------------------------! |
---|
[1682] | 640 | SUBROUTINE transpose_yz( f_inv, f_out ) |
---|
[1] | 641 | |
---|
[1682] | 642 | |
---|
[1320] | 643 | USE cpulog, & |
---|
| 644 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
[1] | 645 | |
---|
[1320] | 646 | USE indices, & |
---|
| 647 | ONLY: ny, nz |
---|
| 648 | |
---|
| 649 | USE kinds |
---|
| 650 | |
---|
[1324] | 651 | USE pegrid |
---|
[1320] | 652 | |
---|
| 653 | USE transpose_indices, & |
---|
| 654 | ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y |
---|
| 655 | |
---|
[1] | 656 | IMPLICIT NONE |
---|
| 657 | |
---|
[1682] | 658 | INTEGER(iwp) :: i !< |
---|
| 659 | INTEGER(iwp) :: j !< |
---|
| 660 | INTEGER(iwp) :: k !< |
---|
| 661 | INTEGER(iwp) :: l !< |
---|
| 662 | INTEGER(iwp) :: zs !< |
---|
[1] | 663 | |
---|
[1682] | 664 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
| 665 | REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< |
---|
[1111] | 666 | |
---|
[1682] | 667 | REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< |
---|
[3634] | 668 | !$ACC DECLARE CREATE(work) |
---|
[1111] | 669 | |
---|
[1320] | 670 | |
---|
[1] | 671 | ! |
---|
| 672 | !-- If the PE grid is one-dimensional along y, only local reordering |
---|
| 673 | !-- of the data is necessary and no transposition has to be done. |
---|
| 674 | IF ( pdims(1) == 1 ) THEN |
---|
[1106] | 675 | |
---|
[683] | 676 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 677 | !$OMP DO |
---|
[3634] | 678 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 679 | !$ACC PRESENT(f_out, f_inv) |
---|
[1003] | 680 | DO j = 0, ny |
---|
| 681 | DO k = nzb_y, nzt_y |
---|
| 682 | DO i = nxl_y, nxr_y |
---|
[164] | 683 | f_out(i,j,k) = f_inv(i,k,j) |
---|
[1] | 684 | ENDDO |
---|
| 685 | ENDDO |
---|
| 686 | ENDDO |
---|
[683] | 687 | !$OMP END PARALLEL |
---|
[1] | 688 | |
---|
[1106] | 689 | ELSE |
---|
| 690 | |
---|
| 691 | #if defined( __parallel ) |
---|
[1] | 692 | ! |
---|
[1106] | 693 | !-- Transpose array |
---|
[1318] | 694 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 695 | !$ACC UPDATE HOST(f_inv) |
---|
[1106] | 696 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 697 | CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & |
---|
| 698 | work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & |
---|
[1106] | 699 | comm1dx, ierr ) |
---|
[3634] | 700 | !$ACC UPDATE DEVICE(work) |
---|
[1106] | 701 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 702 | |
---|
| 703 | ! |
---|
[1106] | 704 | !-- Reorder transposed array |
---|
[1111] | 705 | !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) |
---|
[683] | 706 | !$OMP DO |
---|
[1106] | 707 | DO l = 0, pdims(1) - 1 |
---|
| 708 | zs = 1 + l * ( nzt_y - nzb_y + 1 ) |
---|
[3634] | 709 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 710 | !$ACC PRESENT(f_out, work) |
---|
[1106] | 711 | DO j = nys_z, nyn_z |
---|
| 712 | DO k = zs, zs + nzt_y - nzb_y |
---|
| 713 | DO i = nxl_z, nxr_z |
---|
[1111] | 714 | f_out(i,j,k) = work(i,k-zs+1,j,l) |
---|
[1106] | 715 | ENDDO |
---|
[1] | 716 | ENDDO |
---|
| 717 | ENDDO |
---|
| 718 | ENDDO |
---|
[683] | 719 | !$OMP END PARALLEL |
---|
[1] | 720 | #endif |
---|
| 721 | |
---|
[1106] | 722 | ENDIF |
---|
| 723 | |
---|
[1] | 724 | END SUBROUTINE transpose_yz |
---|
| 725 | |
---|
| 726 | |
---|
| 727 | !------------------------------------------------------------------------------! |
---|
| 728 | ! Description: |
---|
| 729 | ! ------------ |
---|
[1682] | 730 | !> Resorting data for the transposition from z to x. The transposition itself |
---|
| 731 | !> is carried out in transpose_zx |
---|
[1216] | 732 | !------------------------------------------------------------------------------! |
---|
[1682] | 733 | SUBROUTINE resort_for_zx( f_in, f_inv ) |
---|
[1216] | 734 | |
---|
[1682] | 735 | |
---|
[1320] | 736 | USE indices, & |
---|
| 737 | ONLY: nxl, nxr, nyn, nys, nz |
---|
[1216] | 738 | |
---|
[1320] | 739 | USE kinds |
---|
| 740 | |
---|
[1216] | 741 | IMPLICIT NONE |
---|
| 742 | |
---|
[1682] | 743 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 744 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
[1216] | 745 | |
---|
[1682] | 746 | INTEGER(iwp) :: i !< |
---|
| 747 | INTEGER(iwp) :: j !< |
---|
| 748 | INTEGER(iwp) :: k !< |
---|
[1216] | 749 | |
---|
| 750 | ! |
---|
| 751 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 752 | !-- by MPI contiguous |
---|
| 753 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 754 | !$OMP DO |
---|
[3634] | 755 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 756 | !$ACC PRESENT(f_in, f_inv) |
---|
[1216] | 757 | DO k = 1,nz |
---|
| 758 | DO i = nxl, nxr |
---|
| 759 | DO j = nys, nyn |
---|
| 760 | f_inv(j,i,k) = f_in(k,j,i) |
---|
| 761 | ENDDO |
---|
| 762 | ENDDO |
---|
| 763 | ENDDO |
---|
| 764 | !$OMP END PARALLEL |
---|
| 765 | |
---|
| 766 | END SUBROUTINE resort_for_zx |
---|
| 767 | |
---|
| 768 | |
---|
| 769 | !------------------------------------------------------------------------------! |
---|
| 770 | ! Description: |
---|
| 771 | ! ------------ |
---|
[1682] | 772 | !> Transposition of input array (f_in) from z to x. For the input array, all |
---|
| 773 | !> elements along z reside on the same PE, while after transposition, all |
---|
| 774 | !> elements along x reside on the same PE. |
---|
[1] | 775 | !------------------------------------------------------------------------------! |
---|
[1682] | 776 | SUBROUTINE transpose_zx( f_inv, f_out ) |
---|
[1] | 777 | |
---|
[1682] | 778 | |
---|
[1320] | 779 | USE cpulog, & |
---|
| 780 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
[1] | 781 | |
---|
[1320] | 782 | USE indices, & |
---|
| 783 | ONLY: nnx, nx, nxl, nxr, nyn, nys, nz |
---|
| 784 | |
---|
| 785 | USE kinds |
---|
| 786 | |
---|
[1324] | 787 | USE pegrid |
---|
[1320] | 788 | |
---|
| 789 | USE transpose_indices, & |
---|
| 790 | ONLY: nyn_x, nys_x, nzb_x, nzt_x |
---|
| 791 | |
---|
[1] | 792 | IMPLICIT NONE |
---|
| 793 | |
---|
[1682] | 794 | INTEGER(iwp) :: i !< |
---|
| 795 | INTEGER(iwp) :: j !< |
---|
| 796 | INTEGER(iwp) :: k !< |
---|
| 797 | INTEGER(iwp) :: l !< |
---|
| 798 | INTEGER(iwp) :: xs !< |
---|
[1] | 799 | |
---|
[1682] | 800 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 801 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
[1111] | 802 | |
---|
[1682] | 803 | REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< |
---|
[3634] | 804 | !$ACC DECLARE CREATE(work) |
---|
[1] | 805 | |
---|
[1320] | 806 | |
---|
[1] | 807 | ! |
---|
| 808 | !-- If the PE grid is one-dimensional along y, only local reordering |
---|
| 809 | !-- of the data is necessary and no transposition has to be done. |
---|
| 810 | IF ( pdims(1) == 1 ) THEN |
---|
[1106] | 811 | |
---|
[683] | 812 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 813 | !$OMP DO |
---|
[3634] | 814 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 815 | !$ACC PRESENT(f_out, f_inv) |
---|
[1003] | 816 | DO k = 1, nz |
---|
| 817 | DO i = nxl, nxr |
---|
| 818 | DO j = nys, nyn |
---|
[164] | 819 | f_out(i,j,k) = f_inv(j,i,k) |
---|
[1] | 820 | ENDDO |
---|
| 821 | ENDDO |
---|
| 822 | ENDDO |
---|
[683] | 823 | !$OMP END PARALLEL |
---|
[1] | 824 | |
---|
[1106] | 825 | ELSE |
---|
| 826 | |
---|
| 827 | #if defined( __parallel ) |
---|
[1] | 828 | ! |
---|
[1106] | 829 | !-- Transpose array |
---|
[1318] | 830 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 831 | !$ACC UPDATE HOST(f_inv) |
---|
[1106] | 832 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 833 | CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & |
---|
| 834 | work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & |
---|
[1106] | 835 | comm1dx, ierr ) |
---|
[3634] | 836 | !$ACC UPDATE DEVICE(work) |
---|
[1106] | 837 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 838 | |
---|
| 839 | ! |
---|
[1106] | 840 | !-- Reorder transposed array |
---|
[1111] | 841 | !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) |
---|
[683] | 842 | !$OMP DO |
---|
[1106] | 843 | DO l = 0, pdims(1) - 1 |
---|
| 844 | xs = 0 + l * nnx |
---|
[3634] | 845 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 846 | !$ACC PRESENT(f_out, work) |
---|
[1106] | 847 | DO k = nzb_x, nzt_x |
---|
| 848 | DO i = xs, xs + nnx - 1 |
---|
| 849 | DO j = nys_x, nyn_x |
---|
[1111] | 850 | f_out(i,j,k) = work(j,i-xs+1,k,l) |
---|
[1106] | 851 | ENDDO |
---|
[1] | 852 | ENDDO |
---|
| 853 | ENDDO |
---|
| 854 | ENDDO |
---|
[683] | 855 | !$OMP END PARALLEL |
---|
[1] | 856 | #endif |
---|
| 857 | |
---|
[1106] | 858 | ENDIF |
---|
| 859 | |
---|
[1] | 860 | END SUBROUTINE transpose_zx |
---|
| 861 | |
---|
| 862 | |
---|
| 863 | !------------------------------------------------------------------------------! |
---|
| 864 | ! Description: |
---|
| 865 | ! ------------ |
---|
[1682] | 866 | !> Resorting data after the transposition from z to y. The transposition itself |
---|
| 867 | !> is carried out in transpose_zy |
---|
[1216] | 868 | !------------------------------------------------------------------------------! |
---|
[1682] | 869 | SUBROUTINE resort_for_zy( f_inv, f_out ) |
---|
[1216] | 870 | |
---|
[1682] | 871 | |
---|
[1320] | 872 | USE indices, & |
---|
| 873 | ONLY: ny |
---|
[1216] | 874 | |
---|
[1320] | 875 | USE kinds |
---|
| 876 | |
---|
| 877 | USE transpose_indices, & |
---|
| 878 | ONLY: nxl_y, nxr_y, nzb_y, nzt_y |
---|
| 879 | |
---|
[1216] | 880 | IMPLICIT NONE |
---|
| 881 | |
---|
[1682] | 882 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
| 883 | REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
[1216] | 884 | |
---|
| 885 | |
---|
[1682] | 886 | INTEGER(iwp) :: i !< |
---|
| 887 | INTEGER(iwp) :: j !< |
---|
| 888 | INTEGER(iwp) :: k !< |
---|
[1216] | 889 | |
---|
| 890 | ! |
---|
| 891 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 892 | !-- by MPI contiguous |
---|
| 893 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 894 | !$OMP DO |
---|
[3634] | 895 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 896 | !$ACC PRESENT(f_out, f_inv) |
---|
[1216] | 897 | DO k = nzb_y, nzt_y |
---|
| 898 | DO j = 0, ny |
---|
| 899 | DO i = nxl_y, nxr_y |
---|
| 900 | f_out(j,i,k) = f_inv(i,k,j) |
---|
| 901 | ENDDO |
---|
| 902 | ENDDO |
---|
| 903 | ENDDO |
---|
| 904 | !$OMP END PARALLEL |
---|
| 905 | |
---|
| 906 | END SUBROUTINE resort_for_zy |
---|
| 907 | |
---|
| 908 | |
---|
| 909 | !------------------------------------------------------------------------------! |
---|
[3241] | 910 | ! Description:cpu_log_nowait |
---|
[1216] | 911 | ! ------------ |
---|
[1682] | 912 | !> Transposition of input array (f_in) from z to y. For the input array, all |
---|
| 913 | !> elements along z reside on the same PE, while after transposition, all |
---|
| 914 | !> elements along y reside on the same PE. |
---|
[1] | 915 | !------------------------------------------------------------------------------! |
---|
[1682] | 916 | SUBROUTINE transpose_zy( f_in, f_inv ) |
---|
[1] | 917 | |
---|
[1682] | 918 | |
---|
[1320] | 919 | USE cpulog, & |
---|
| 920 | ONLY: cpu_log, cpu_log_nowait, log_point_s |
---|
[1] | 921 | |
---|
[1320] | 922 | USE indices, & |
---|
| 923 | ONLY: ny, nz |
---|
| 924 | |
---|
| 925 | USE kinds |
---|
| 926 | |
---|
[1324] | 927 | USE pegrid |
---|
[1320] | 928 | |
---|
| 929 | USE transpose_indices, & |
---|
| 930 | ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y |
---|
| 931 | |
---|
[1] | 932 | IMPLICIT NONE |
---|
| 933 | |
---|
[1682] | 934 | INTEGER(iwp) :: i !< |
---|
| 935 | INTEGER(iwp) :: j !< |
---|
| 936 | INTEGER(iwp) :: k !< |
---|
| 937 | INTEGER(iwp) :: l !< |
---|
| 938 | INTEGER(iwp) :: zs !< |
---|
[1] | 939 | |
---|
[1682] | 940 | REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< |
---|
| 941 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
[1111] | 942 | |
---|
[1682] | 943 | REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< |
---|
[3634] | 944 | !$ACC DECLARE CREATE(work) |
---|
[1111] | 945 | |
---|
[1] | 946 | ! |
---|
| 947 | !-- If the PE grid is one-dimensional along y, the array has only to be |
---|
| 948 | !-- reordered locally and therefore no transposition has to be done. |
---|
| 949 | IF ( pdims(1) /= 1 ) THEN |
---|
[1106] | 950 | |
---|
| 951 | #if defined( __parallel ) |
---|
[1] | 952 | ! |
---|
| 953 | !-- Reorder input array for transposition |
---|
[1111] | 954 | !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) |
---|
[683] | 955 | !$OMP DO |
---|
[1] | 956 | DO l = 0, pdims(1) - 1 |
---|
[1003] | 957 | zs = 1 + l * ( nzt_y - nzb_y + 1 ) |
---|
[3634] | 958 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 959 | !$ACC PRESENT(work, f_in) |
---|
[1003] | 960 | DO j = nys_z, nyn_z |
---|
| 961 | DO k = zs, zs + nzt_y - nzb_y |
---|
| 962 | DO i = nxl_z, nxr_z |
---|
[1111] | 963 | work(i,k-zs+1,j,l) = f_in(i,j,k) |
---|
[1] | 964 | ENDDO |
---|
| 965 | ENDDO |
---|
| 966 | ENDDO |
---|
| 967 | ENDDO |
---|
[683] | 968 | !$OMP END PARALLEL |
---|
[1] | 969 | |
---|
| 970 | ! |
---|
| 971 | !-- Transpose array |
---|
[1318] | 972 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3634] | 973 | !$ACC UPDATE HOST(work) |
---|
[622] | 974 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1111] | 975 | CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & |
---|
| 976 | f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & |
---|
[1] | 977 | comm1dx, ierr ) |
---|
[3634] | 978 | !$ACC UPDATE DEVICE(f_inv) |
---|
[1] | 979 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1106] | 980 | #endif |
---|
[1] | 981 | |
---|
| 982 | ELSE |
---|
| 983 | ! |
---|
[1106] | 984 | !-- Reorder the array in the same way like ALLTOALL did it |
---|
[683] | 985 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 986 | !$OMP DO |
---|
[3634] | 987 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 988 | !$ACC PRESENT(f_inv, f_in) |
---|
[1003] | 989 | DO k = nzb_y, nzt_y |
---|
| 990 | DO j = 0, ny |
---|
| 991 | DO i = nxl_y, nxr_y |
---|
[164] | 992 | f_inv(i,k,j) = f_in(i,j,k) |
---|
| 993 | ENDDO |
---|
| 994 | ENDDO |
---|
| 995 | ENDDO |
---|
[683] | 996 | !$OMP END PARALLEL |
---|
[1106] | 997 | |
---|
| 998 | ENDIF |
---|
| 999 | |
---|
[1] | 1000 | END SUBROUTINE transpose_zy |
---|
| 1001 | |
---|
| 1002 | |
---|
| 1003 | !------------------------------------------------------------------------------! |
---|
| 1004 | ! Description: |
---|
| 1005 | ! ------------ |
---|
[1682] | 1006 | !> Transposition of input array (f_in) from z to y. For the input array, all |
---|
| 1007 | !> elements along z reside on the same PE, while after transposition, all |
---|
| 1008 | !> elements along y reside on the same PE. |
---|
| 1009 | !> This is a direct transposition for arrays with indices in regular order |
---|
| 1010 | !> (k,j,i) (cf. transpose_zy). |
---|
[1] | 1011 | !------------------------------------------------------------------------------! |
---|
[1682] | 1012 | SUBROUTINE transpose_zyd( f_in, f_out ) |
---|
[1] | 1013 | |
---|
[1682] | 1014 | |
---|
[1320] | 1015 | USE cpulog, & |
---|
[3241] | 1016 | ONLY: cpu_log, log_point_s |
---|
[1] | 1017 | |
---|
[1320] | 1018 | USE indices, & |
---|
| 1019 | ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz |
---|
| 1020 | |
---|
| 1021 | USE kinds |
---|
| 1022 | |
---|
[1324] | 1023 | USE pegrid |
---|
[1320] | 1024 | |
---|
| 1025 | USE transpose_indices, & |
---|
[3241] | 1026 | ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd |
---|
[1320] | 1027 | |
---|
[1] | 1028 | IMPLICIT NONE |
---|
| 1029 | |
---|
[1682] | 1030 | INTEGER(iwp) :: i !< |
---|
| 1031 | INTEGER(iwp) :: j !< |
---|
| 1032 | INTEGER(iwp) :: k !< |
---|
| 1033 | INTEGER(iwp) :: l !< |
---|
| 1034 | INTEGER(iwp) :: m !< |
---|
| 1035 | INTEGER(iwp) :: ys !< |
---|
[1] | 1036 | |
---|
[1682] | 1037 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 1038 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 1039 | REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !< |
---|
| 1040 | REAL(wp) :: work(nnx*nny*nnz) !< |
---|
[1320] | 1041 | |
---|
[1] | 1042 | #if defined( __parallel ) |
---|
| 1043 | |
---|
| 1044 | ! |
---|
| 1045 | !-- Rearrange indices of input array in order to make data to be send |
---|
| 1046 | !-- by MPI contiguous |
---|
[1003] | 1047 | DO i = nxl, nxr |
---|
| 1048 | DO j = nys, nyn |
---|
| 1049 | DO k = 1, nz |
---|
[164] | 1050 | f_inv(j,i,k) = f_in(k,j,i) |
---|
[1] | 1051 | ENDDO |
---|
| 1052 | ENDDO |
---|
| 1053 | ENDDO |
---|
| 1054 | |
---|
| 1055 | ! |
---|
| 1056 | !-- Move data to different array, because memory location of work1 is |
---|
| 1057 | !-- needed further below (work1 = work2). |
---|
| 1058 | !-- If the PE grid is one-dimensional along x, only local reordering |
---|
| 1059 | !-- of the data is necessary and no transposition has to be done. |
---|
| 1060 | IF ( pdims(2) == 1 ) THEN |
---|
[1003] | 1061 | DO k = 1, nz |
---|
| 1062 | DO i = nxl, nxr |
---|
| 1063 | DO j = nys, nyn |
---|
[164] | 1064 | f_out(j,i,k) = f_inv(j,i,k) |
---|
[1] | 1065 | ENDDO |
---|
| 1066 | ENDDO |
---|
| 1067 | ENDDO |
---|
| 1068 | RETURN |
---|
| 1069 | ENDIF |
---|
| 1070 | |
---|
| 1071 | ! |
---|
| 1072 | !-- Transpose array |
---|
| 1073 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) |
---|
[622] | 1074 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[1] | 1075 | CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, & |
---|
[164] | 1076 | work(1), sendrecvcount_zyd, MPI_REAL, & |
---|
[1] | 1077 | comm1dy, ierr ) |
---|
| 1078 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 1079 | |
---|
| 1080 | ! |
---|
| 1081 | !-- Reorder transposed array |
---|
| 1082 | m = 0 |
---|
| 1083 | DO l = 0, pdims(2) - 1 |
---|
| 1084 | ys = 0 + l * nny |
---|
[1003] | 1085 | DO k = nzb_yd, nzt_yd |
---|
| 1086 | DO i = nxl_yd, nxr_yd |
---|
[1] | 1087 | DO j = ys, ys + nny - 1 |
---|
| 1088 | m = m + 1 |
---|
[164] | 1089 | f_out(j,i,k) = work(m) |
---|
[1] | 1090 | ENDDO |
---|
| 1091 | ENDDO |
---|
| 1092 | ENDDO |
---|
| 1093 | ENDDO |
---|
| 1094 | |
---|
| 1095 | #endif |
---|
| 1096 | |
---|
| 1097 | END SUBROUTINE transpose_zyd |
---|