[1682] | 1 | !> @file transpose.f90 |
---|
[4540] | 2 | !--------------------------------------------------------------------------------------------------! |
---|
[2696] | 3 | ! This file is part of the PALM model system. |
---|
[1036] | 4 | ! |
---|
[4540] | 5 | ! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General |
---|
| 6 | ! Public License as published by the Free Software Foundation, either version 3 of the License, or |
---|
| 7 | ! (at your option) any later version. |
---|
[1036] | 8 | ! |
---|
[4540] | 9 | ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the |
---|
| 10 | ! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General |
---|
| 11 | ! Public License for more details. |
---|
[1036] | 12 | ! |
---|
[4540] | 13 | ! You should have received a copy of the GNU General Public License along with PALM. If not, see |
---|
| 14 | ! <http://www.gnu.org/licenses/>. |
---|
[1036] | 15 | ! |
---|
[4360] | 16 | ! Copyright 1997-2020 Leibniz Universitaet Hannover |
---|
[4540] | 17 | !--------------------------------------------------------------------------------------------------! |
---|
[1036] | 18 | ! |
---|
[4540] | 19 | ! |
---|
[484] | 20 | ! Current revisions: |
---|
[1] | 21 | ! ----------------- |
---|
[4717] | 22 | ! |
---|
| 23 | ! |
---|
[1321] | 24 | ! Former revisions: |
---|
| 25 | ! ----------------- |
---|
| 26 | ! $Id: transpose.f90 4717 2020-09-30 22:27:40Z schwenkel $ |
---|
[4717] | 27 | ! Formatting of OpenMP directives (J. Resler) |
---|
| 28 | ! |
---|
| 29 | ! 4540 2020-05-18 15:23:29Z raasch |
---|
[4540] | 30 | ! File re-formatted to follow the PALM coding standard |
---|
| 31 | ! |
---|
| 32 | ! 4429 2020-02-27 15:24:30Z raasch |
---|
| 33 | ! Bugfix: cpp-directives added for serial mode |
---|
| 34 | ! |
---|
[4429] | 35 | ! 4415 2020-02-20 10:30:33Z raasch |
---|
[4540] | 36 | ! Bugfix for misplaced preprocessor directive |
---|
| 37 | ! |
---|
[4415] | 38 | ! 4370 2020-01-10 14:00:44Z raasch |
---|
[4540] | 39 | ! Vector array renamed |
---|
| 40 | ! |
---|
[4370] | 41 | ! 4366 2020-01-09 08:12:43Z raasch |
---|
[4540] | 42 | ! Modifications for NEC vectorization |
---|
| 43 | ! |
---|
[4366] | 44 | ! 4360 2020-01-07 11:25:50Z suehring |
---|
[4236] | 45 | ! Added missing OpenMP directives |
---|
[4540] | 46 | ! |
---|
[4236] | 47 | ! 4182 2019-08-22 15:20:23Z scharf |
---|
[4182] | 48 | ! Corrected "Former revisions" section |
---|
[4540] | 49 | ! |
---|
[4182] | 50 | ! 4171 2019-08-19 17:44:09Z gronemeier |
---|
[4540] | 51 | ! Loop reordering for performance optimization |
---|
[4171] | 52 | ! |
---|
| 53 | ! 3832 2019-03-28 13:16:58Z raasch |
---|
[4540] | 54 | ! Loop reordering for performance optimization |
---|
[4171] | 55 | ! |
---|
[3832] | 56 | ! 3694 2019-01-23 17:01:49Z knoop |
---|
[3634] | 57 | ! OpenACC port for SPEC |
---|
[4171] | 58 | ! |
---|
[4182] | 59 | ! Revision 1.1 1997/07/24 11:25:18 raasch |
---|
| 60 | ! Initial revision |
---|
| 61 | ! |
---|
| 62 | ! |
---|
[1216] | 63 | ! Description: |
---|
| 64 | ! ------------ |
---|
[4540] | 65 | !> Resorting data for the transposition from x to y. The transposition itself is carried out in |
---|
| 66 | !> transpose_xy. |
---|
| 67 | !--------------------------------------------------------------------------------------------------! |
---|
[4181] | 68 | |
---|
| 69 | #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) ) |
---|
| 70 | |
---|
[1682] | 71 | SUBROUTINE resort_for_xy( f_in, f_inv ) |
---|
[1216] | 72 | |
---|
[4171] | 73 | |
---|
[4540] | 74 | USE indices, & |
---|
| 75 | ONLY: nx |
---|
[1216] | 76 | |
---|
[4540] | 77 | USE kinds |
---|
[1320] | 78 | |
---|
[4540] | 79 | USE transpose_indices, & |
---|
| 80 | ONLY: nyn_x, & |
---|
| 81 | nys_x, & |
---|
| 82 | nzb_x, & |
---|
| 83 | nzt_x |
---|
[1320] | 84 | |
---|
[4540] | 85 | IMPLICIT NONE |
---|
[1216] | 86 | |
---|
[4540] | 87 | INTEGER(iwp) :: i !< |
---|
| 88 | INTEGER(iwp) :: j !< |
---|
| 89 | INTEGER(iwp) :: k !< |
---|
[1216] | 90 | |
---|
[4540] | 91 | REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 92 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
[1216] | 93 | |
---|
[1] | 94 | ! |
---|
[4540] | 95 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous |
---|
[1216] | 96 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 97 | !$OMP DO |
---|
[3690] | 98 | #if __acc_fft_device |
---|
[4540] | 99 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 100 | !$ACC PRESENT(f_inv, f_in) |
---|
[3690] | 101 | #endif |
---|
[4540] | 102 | DO k = nzb_x, nzt_x |
---|
| 103 | DO j = nys_x, nyn_x |
---|
| 104 | DO i = 0, nx |
---|
| 105 | f_inv(j,k,i) = f_in(i,j,k) |
---|
| 106 | ENDDO |
---|
| 107 | ENDDO |
---|
| 108 | ENDDO |
---|
| 109 | !$OMP END PARALLEL |
---|
[1216] | 110 | |
---|
| 111 | END SUBROUTINE resort_for_xy |
---|
| 112 | |
---|
| 113 | |
---|
[4540] | 114 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 115 | ! Description: |
---|
| 116 | ! ------------ |
---|
[4540] | 117 | !> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside |
---|
| 118 | !> on the same PE, while after transposition, all elements along y reside on the same PE. |
---|
| 119 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 120 | SUBROUTINE transpose_xy( f_inv, f_out ) |
---|
[1] | 121 | |
---|
[1682] | 122 | |
---|
[4429] | 123 | #if defined( __parallel ) |
---|
[4540] | 124 | USE cpulog, & |
---|
| 125 | ONLY: cpu_log, & |
---|
| 126 | cpu_log_nowait, & |
---|
| 127 | log_point_s |
---|
[4429] | 128 | #endif |
---|
[1320] | 129 | |
---|
[4540] | 130 | USE indices, & |
---|
| 131 | ONLY: nx, & |
---|
| 132 | ny |
---|
[4171] | 133 | |
---|
[1320] | 134 | USE kinds |
---|
| 135 | |
---|
[1] | 136 | USE pegrid |
---|
| 137 | |
---|
[4540] | 138 | USE transpose_indices, & |
---|
| 139 | ONLY: nxl_y, & |
---|
| 140 | nxr_y, & |
---|
| 141 | nyn_x, & |
---|
| 142 | nys_x, & |
---|
| 143 | nzb_x, & |
---|
| 144 | nzb_y, & |
---|
| 145 | nzt_x, & |
---|
| 146 | nzt_y |
---|
[1320] | 147 | |
---|
[1] | 148 | IMPLICIT NONE |
---|
| 149 | |
---|
[4171] | 150 | INTEGER(iwp) :: i !< |
---|
| 151 | INTEGER(iwp) :: j !< |
---|
| 152 | INTEGER(iwp) :: k !< |
---|
[4429] | 153 | |
---|
| 154 | #if defined( __parallel ) |
---|
[4540] | 155 | INTEGER(iwp) :: l !< |
---|
| 156 | INTEGER(iwp) :: ys !< |
---|
[4429] | 157 | #endif |
---|
[1] | 158 | |
---|
[4540] | 159 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
| 160 | REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
[4171] | 161 | |
---|
[4429] | 162 | #if defined( __parallel ) |
---|
[4540] | 163 | REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< |
---|
[3690] | 164 | #if __acc_fft_device |
---|
[3634] | 165 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 166 | #endif |
---|
[4429] | 167 | #endif |
---|
[1111] | 168 | |
---|
| 169 | |
---|
[1106] | 170 | IF ( numprocs /= 1 ) THEN |
---|
| 171 | |
---|
| 172 | #if defined( __parallel ) |
---|
[1] | 173 | ! |
---|
[1106] | 174 | !-- Transpose array |
---|
[1318] | 175 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 176 | |
---|
| 177 | #if __acc_fft_device |
---|
[3657] | 178 | #ifndef __cuda_aware_mpi |
---|
[3634] | 179 | !$ACC UPDATE HOST(f_inv) |
---|
[3657] | 180 | #else |
---|
| 181 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 182 | #endif |
---|
[3690] | 183 | #endif |
---|
| 184 | |
---|
[1106] | 185 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 186 | CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & |
---|
| 187 | work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) |
---|
[3690] | 188 | |
---|
| 189 | #if __acc_fft_device |
---|
[3657] | 190 | #ifndef __cuda_aware_mpi |
---|
[3634] | 191 | !$ACC UPDATE DEVICE(work) |
---|
[3657] | 192 | #else |
---|
| 193 | !$ACC END HOST_DATA |
---|
| 194 | #endif |
---|
[3690] | 195 | #endif |
---|
| 196 | |
---|
[1106] | 197 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 198 | |
---|
| 199 | ! |
---|
[1106] | 200 | !-- Reorder transposed array |
---|
[4717] | 201 | !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) |
---|
[1106] | 202 | DO l = 0, pdims(2) - 1 |
---|
| 203 | ys = 0 + l * ( nyn_x - nys_x + 1 ) |
---|
[3690] | 204 | #if __acc_fft_device |
---|
[3634] | 205 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 206 | !$ACC PRESENT(f_out, work) |
---|
[3690] | 207 | #endif |
---|
[4236] | 208 | !$OMP DO |
---|
[1106] | 209 | DO i = nxl_y, nxr_y |
---|
| 210 | DO k = nzb_y, nzt_y |
---|
| 211 | DO j = ys, ys + nyn_x - nys_x |
---|
[1111] | 212 | f_out(j,i,k) = work(j-ys+1,k,i,l) |
---|
[1106] | 213 | ENDDO |
---|
[1] | 214 | ENDDO |
---|
| 215 | ENDDO |
---|
[4236] | 216 | !$OMP END DO NOWAIT |
---|
[1] | 217 | ENDDO |
---|
[4717] | 218 | !$OMP END PARALLEL |
---|
[1] | 219 | #endif |
---|
| 220 | |
---|
[1106] | 221 | ELSE |
---|
| 222 | |
---|
| 223 | ! |
---|
| 224 | !-- Reorder transposed array |
---|
[4717] | 225 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 226 | !$OMP DO |
---|
[3690] | 227 | #if __acc_fft_device |
---|
[3634] | 228 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 229 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 230 | #endif |
---|
[1106] | 231 | DO k = nzb_y, nzt_y |
---|
| 232 | DO i = nxl_y, nxr_y |
---|
| 233 | DO j = 0, ny |
---|
| 234 | f_out(j,i,k) = f_inv(j,k,i) |
---|
| 235 | ENDDO |
---|
| 236 | ENDDO |
---|
| 237 | ENDDO |
---|
[4717] | 238 | !$OMP END PARALLEL |
---|
[1106] | 239 | |
---|
| 240 | ENDIF |
---|
| 241 | |
---|
[1] | 242 | END SUBROUTINE transpose_xy |
---|
| 243 | |
---|
| 244 | |
---|
[4540] | 245 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 246 | ! Description: |
---|
| 247 | ! ------------ |
---|
[4540] | 248 | !> Resorting data after the transposition from x to z. The transposition itself is carried out in |
---|
| 249 | !> transpose_xz. |
---|
| 250 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 251 | SUBROUTINE resort_for_xz( f_inv, f_out ) |
---|
[1216] | 252 | |
---|
[1682] | 253 | |
---|
[4540] | 254 | USE indices, & |
---|
| 255 | ONLY: nxl, & |
---|
| 256 | nxr, & |
---|
| 257 | nyn, & |
---|
| 258 | nys, & |
---|
| 259 | nz |
---|
[1216] | 260 | |
---|
[4540] | 261 | USE kinds |
---|
[1320] | 262 | |
---|
[4540] | 263 | IMPLICIT NONE |
---|
[1216] | 264 | |
---|
[4540] | 265 | INTEGER(iwp) :: i !< |
---|
| 266 | INTEGER(iwp) :: j !< |
---|
| 267 | INTEGER(iwp) :: k !< |
---|
[1216] | 268 | |
---|
[4540] | 269 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 270 | REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 271 | |
---|
[1216] | 272 | ! |
---|
[4540] | 273 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
| 274 | !-- In case of parallel fft/transposition, scattered store is faster in backward direction!!! |
---|
[4717] | 275 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 276 | !$OMP DO |
---|
[3690] | 277 | #if __acc_fft_device |
---|
[4717] | 278 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 279 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 280 | #endif |
---|
[4540] | 281 | DO i = nxl, nxr |
---|
| 282 | DO j = nys, nyn |
---|
| 283 | DO k = 1, nz |
---|
| 284 | f_out(k,j,i) = f_inv(j,i,k) |
---|
| 285 | ENDDO |
---|
| 286 | ENDDO |
---|
| 287 | ENDDO |
---|
| 288 | !$OMP END PARALLEL |
---|
[1216] | 289 | |
---|
| 290 | END SUBROUTINE resort_for_xz |
---|
| 291 | |
---|
| 292 | |
---|
[4540] | 293 | !--------------------------------------------------------------------------------------------------! |
---|
[1216] | 294 | ! Description: |
---|
| 295 | ! ------------ |
---|
[4540] | 296 | !> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside |
---|
| 297 | !> on the same PE, while after transposition, all elements along z reside on the same PE. |
---|
| 298 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 299 | SUBROUTINE transpose_xz( f_in, f_inv ) |
---|
[1] | 300 | |
---|
[4429] | 301 | #if defined( __parallel ) |
---|
[4540] | 302 | USE cpulog, & |
---|
| 303 | ONLY: cpu_log, & |
---|
| 304 | cpu_log_nowait, & |
---|
| 305 | log_point_s |
---|
[1] | 306 | |
---|
[4540] | 307 | USE fft_xy, & |
---|
| 308 | ONLY: f_vec_x, & |
---|
| 309 | temperton_fft_vec |
---|
[4429] | 310 | #endif |
---|
[4366] | 311 | |
---|
[4540] | 312 | USE indices, & |
---|
| 313 | ONLY: nx, & |
---|
| 314 | nxl, & |
---|
| 315 | nxr, & |
---|
| 316 | nyn, & |
---|
| 317 | nys, & |
---|
| 318 | nz |
---|
| 319 | |
---|
[4429] | 320 | #if defined( __parallel ) |
---|
[4540] | 321 | USE indices, & |
---|
[4429] | 322 | ONLY: nnx |
---|
| 323 | #endif |
---|
[1320] | 324 | |
---|
| 325 | USE kinds |
---|
| 326 | |
---|
[1324] | 327 | USE pegrid |
---|
[1320] | 328 | |
---|
[4540] | 329 | USE transpose_indices, & |
---|
| 330 | ONLY: nyn_x, & |
---|
| 331 | nys_x, & |
---|
| 332 | nzb_x, & |
---|
| 333 | nzt_x |
---|
[1320] | 334 | |
---|
[1] | 335 | IMPLICIT NONE |
---|
| 336 | |
---|
[4171] | 337 | INTEGER(iwp) :: i !< |
---|
| 338 | INTEGER(iwp) :: j !< |
---|
| 339 | INTEGER(iwp) :: k !< |
---|
[4540] | 340 | |
---|
[4429] | 341 | #if defined( __parallel ) |
---|
[4540] | 342 | INTEGER(iwp) :: l !< |
---|
| 343 | INTEGER(iwp) :: mm !< |
---|
| 344 | INTEGER(iwp) :: xs !< |
---|
[4429] | 345 | #endif |
---|
[1] | 346 | |
---|
[4540] | 347 | REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 348 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
[1] | 349 | |
---|
[4429] | 350 | #if defined( __parallel ) |
---|
[4540] | 351 | REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< |
---|
[3690] | 352 | #if __acc_fft_device |
---|
[3634] | 353 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 354 | #endif |
---|
[4429] | 355 | #endif |
---|
[1111] | 356 | |
---|
[4366] | 357 | ! |
---|
[4540] | 358 | !-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and |
---|
| 359 | !-- therefore no transposition has to be done. |
---|
[1] | 360 | IF ( pdims(1) /= 1 ) THEN |
---|
[1106] | 361 | |
---|
| 362 | #if defined( __parallel ) |
---|
[1] | 363 | ! |
---|
[4366] | 364 | !-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in |
---|
[4370] | 365 | !-- different array format (f_vec_x). |
---|
[4366] | 366 | IF ( temperton_fft_vec ) THEN |
---|
| 367 | |
---|
| 368 | DO l = 0, pdims(1) - 1 |
---|
| 369 | xs = 0 + l * nnx |
---|
| 370 | DO k = nzb_x, nzt_x |
---|
| 371 | DO i = xs, xs + nnx - 1 |
---|
| 372 | DO j = nys_x, nyn_x |
---|
| 373 | mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) |
---|
[4370] | 374 | work(j,i-xs+1,k,l) = f_vec_x(mm,i) |
---|
[4366] | 375 | ENDDO |
---|
| 376 | ENDDO |
---|
| 377 | ENDDO |
---|
| 378 | ENDDO |
---|
| 379 | |
---|
| 380 | ELSE |
---|
| 381 | |
---|
| 382 | !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) |
---|
| 383 | DO l = 0, pdims(1) - 1 |
---|
| 384 | xs = 0 + l * nnx |
---|
[3690] | 385 | #if __acc_fft_device |
---|
[4366] | 386 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 387 | !$ACC PRESENT(work, f_in) |
---|
[3690] | 388 | #endif |
---|
[4366] | 389 | !$OMP DO |
---|
| 390 | DO k = nzb_x, nzt_x |
---|
| 391 | DO i = xs, xs + nnx - 1 |
---|
| 392 | DO j = nys_x, nyn_x |
---|
| 393 | work(j,i-xs+1,k,l) = f_in(i,j,k) |
---|
| 394 | ENDDO |
---|
[1] | 395 | ENDDO |
---|
| 396 | ENDDO |
---|
[4366] | 397 | !$OMP END DO NOWAIT |
---|
[1] | 398 | ENDDO |
---|
[4366] | 399 | !$OMP END PARALLEL |
---|
[1] | 400 | |
---|
[4366] | 401 | ENDIF |
---|
| 402 | |
---|
[1] | 403 | ! |
---|
| 404 | !-- Transpose array |
---|
[1318] | 405 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 406 | |
---|
| 407 | #if __acc_fft_device |
---|
[3657] | 408 | #ifndef __cuda_aware_mpi |
---|
[3634] | 409 | !$ACC UPDATE HOST(work) |
---|
[3657] | 410 | #else |
---|
| 411 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 412 | #endif |
---|
[3690] | 413 | #endif |
---|
| 414 | |
---|
[622] | 415 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 416 | CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & |
---|
| 417 | f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, comm1dx, ierr ) |
---|
[3690] | 418 | |
---|
| 419 | #if __acc_fft_device |
---|
[3657] | 420 | #ifndef __cuda_aware_mpi |
---|
[3634] | 421 | !$ACC UPDATE DEVICE(f_inv) |
---|
[3657] | 422 | #else |
---|
| 423 | !$ACC END HOST_DATA |
---|
| 424 | #endif |
---|
[3694] | 425 | #endif |
---|
| 426 | |
---|
[1] | 427 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1106] | 428 | #endif |
---|
| 429 | |
---|
[1] | 430 | ELSE |
---|
[1106] | 431 | |
---|
[1] | 432 | ! |
---|
| 433 | !-- Reorder the array in a way that the z index is in first position |
---|
[4717] | 434 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 435 | !$OMP DO |
---|
[3690] | 436 | #if __acc_fft_device |
---|
[3634] | 437 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 438 | !$ACC PRESENT(f_inv, f_in) |
---|
[3690] | 439 | #endif |
---|
[1003] | 440 | DO i = nxl, nxr |
---|
| 441 | DO j = nys, nyn |
---|
| 442 | DO k = 1, nz |
---|
[164] | 443 | f_inv(j,i,k) = f_in(i,j,k) |
---|
[1] | 444 | ENDDO |
---|
| 445 | ENDDO |
---|
| 446 | ENDDO |
---|
[4717] | 447 | !$OMP END PARALLEL |
---|
[1] | 448 | |
---|
[164] | 449 | ENDIF |
---|
| 450 | |
---|
[1] | 451 | END SUBROUTINE transpose_xz |
---|
| 452 | |
---|
| 453 | |
---|
[4540] | 454 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 455 | ! Description: |
---|
| 456 | ! ------------ |
---|
[4540] | 457 | !> Resorting data after the transposition from y to x. The transposition itself is carried out in |
---|
| 458 | !> transpose_yx. |
---|
| 459 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 460 | SUBROUTINE resort_for_yx( f_inv, f_out ) |
---|
[1216] | 461 | |
---|
[1682] | 462 | |
---|
[4540] | 463 | USE indices, & |
---|
| 464 | ONLY: nx |
---|
[1216] | 465 | |
---|
[4540] | 466 | USE kinds |
---|
[1320] | 467 | |
---|
[4540] | 468 | USE transpose_indices, & |
---|
| 469 | ONLY: nyn_x, & |
---|
| 470 | nys_x, & |
---|
| 471 | nzb_x, & |
---|
| 472 | nzt_x |
---|
[1320] | 473 | |
---|
[4540] | 474 | IMPLICIT NONE |
---|
[1216] | 475 | |
---|
[4540] | 476 | INTEGER(iwp) :: i !< |
---|
| 477 | INTEGER(iwp) :: j !< |
---|
| 478 | INTEGER(iwp) :: k !< |
---|
[1216] | 479 | |
---|
[4540] | 480 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
| 481 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
[1216] | 482 | |
---|
| 483 | ! |
---|
[4540] | 484 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[4717] | 485 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 486 | !$OMP DO |
---|
[3690] | 487 | #if __acc_fft_device |
---|
[4540] | 488 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 489 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 490 | #endif |
---|
[4540] | 491 | DO k = nzb_x, nzt_x |
---|
| 492 | DO j = nys_x, nyn_x |
---|
| 493 | DO i = 0, nx |
---|
| 494 | f_out(i,j,k) = f_inv(j,k,i) |
---|
| 495 | ENDDO |
---|
| 496 | ENDDO |
---|
| 497 | ENDDO |
---|
[4717] | 498 | !$OMP END PARALLEL |
---|
[1216] | 499 | |
---|
| 500 | END SUBROUTINE resort_for_yx |
---|
| 501 | |
---|
| 502 | |
---|
[4540] | 503 | !--------------------------------------------------------------------------------------------------! |
---|
[1216] | 504 | ! Description: |
---|
| 505 | ! ------------ |
---|
[4540] | 506 | !> Transposition of input array (f_in) from y to x. For the input array, all elements along y |
---|
| 507 | !> reside on the same PE, while after transposition, all elements along x reside on the same PE. |
---|
| 508 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 509 | SUBROUTINE transpose_yx( f_in, f_inv ) |
---|
[1] | 510 | |
---|
[1682] | 511 | |
---|
[4429] | 512 | #if defined( __parallel ) |
---|
[4540] | 513 | USE cpulog, & |
---|
| 514 | ONLY: cpu_log, & |
---|
| 515 | cpu_log_nowait, & |
---|
| 516 | log_point_s |
---|
[4429] | 517 | #endif |
---|
[1] | 518 | |
---|
[4540] | 519 | USE indices, & |
---|
| 520 | ONLY: nx, & |
---|
| 521 | ny |
---|
[1320] | 522 | |
---|
| 523 | USE kinds |
---|
| 524 | |
---|
[1324] | 525 | USE pegrid |
---|
[1320] | 526 | |
---|
[4540] | 527 | USE transpose_indices, & |
---|
| 528 | ONLY: nxl_y, & |
---|
| 529 | nxr_y, & |
---|
| 530 | nyn_x, & |
---|
| 531 | nys_x, & |
---|
| 532 | nzb_x, & |
---|
| 533 | nzb_y, & |
---|
| 534 | nzt_x, & |
---|
| 535 | nzt_y |
---|
[1320] | 536 | |
---|
[1] | 537 | IMPLICIT NONE |
---|
| 538 | |
---|
[4171] | 539 | INTEGER(iwp) :: i !< |
---|
| 540 | INTEGER(iwp) :: j !< |
---|
| 541 | INTEGER(iwp) :: k !< |
---|
[4540] | 542 | |
---|
[4429] | 543 | #if defined( __parallel ) |
---|
[4540] | 544 | INTEGER(iwp) :: l !< |
---|
| 545 | INTEGER(iwp) :: ys !< |
---|
[4429] | 546 | #endif |
---|
[1] | 547 | |
---|
[4540] | 548 | REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
| 549 | REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< |
---|
[1111] | 550 | |
---|
[4429] | 551 | #if defined( __parallel ) |
---|
[4540] | 552 | REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< |
---|
[3690] | 553 | #if __acc_fft_device |
---|
[3634] | 554 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 555 | #endif |
---|
[4429] | 556 | #endif |
---|
[1111] | 557 | |
---|
[1320] | 558 | |
---|
[1106] | 559 | IF ( numprocs /= 1 ) THEN |
---|
| 560 | |
---|
[1] | 561 | #if defined( __parallel ) |
---|
| 562 | ! |
---|
[1106] | 563 | !-- Reorder input array for transposition |
---|
[4717] | 564 | !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) |
---|
[1106] | 565 | DO l = 0, pdims(2) - 1 |
---|
| 566 | ys = 0 + l * ( nyn_x - nys_x + 1 ) |
---|
[3690] | 567 | #if __acc_fft_device |
---|
[3634] | 568 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 569 | !$ACC PRESENT(work, f_in) |
---|
[3690] | 570 | #endif |
---|
[4236] | 571 | !$OMP DO |
---|
[1106] | 572 | DO i = nxl_y, nxr_y |
---|
| 573 | DO k = nzb_y, nzt_y |
---|
| 574 | DO j = ys, ys + nyn_x - nys_x |
---|
[1111] | 575 | work(j-ys+1,k,i,l) = f_in(j,i,k) |
---|
[1106] | 576 | ENDDO |
---|
| 577 | ENDDO |
---|
| 578 | ENDDO |
---|
[4236] | 579 | !$OMP END DO NOWAIT |
---|
[1106] | 580 | ENDDO |
---|
[4717] | 581 | !$OMP END PARALLEL |
---|
[1106] | 582 | |
---|
| 583 | ! |
---|
| 584 | !-- Transpose array |
---|
[1318] | 585 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 586 | |
---|
| 587 | #if __acc_fft_device |
---|
[3657] | 588 | #ifndef __cuda_aware_mpi |
---|
[3634] | 589 | !$ACC UPDATE HOST(work) |
---|
[3657] | 590 | #else |
---|
| 591 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 592 | #endif |
---|
[3690] | 593 | #endif |
---|
| 594 | |
---|
[1106] | 595 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 596 | CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & |
---|
| 597 | f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) |
---|
[3690] | 598 | |
---|
| 599 | #if __acc_fft_device |
---|
[3657] | 600 | #ifndef __cuda_aware_mpi |
---|
[3634] | 601 | !$ACC UPDATE DEVICE(f_inv) |
---|
[3657] | 602 | #else |
---|
| 603 | !$ACC END HOST_DATA |
---|
| 604 | #endif |
---|
[3690] | 605 | #endif |
---|
| 606 | |
---|
[1106] | 607 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 608 | #endif |
---|
| 609 | |
---|
| 610 | ELSE |
---|
| 611 | |
---|
| 612 | ! |
---|
[4540] | 613 | !-- Reorder array f_in the same way as ALLTOALL did it. |
---|
[4717] | 614 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 615 | !$OMP DO |
---|
[3690] | 616 | #if __acc_fft_device |
---|
[3634] | 617 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 618 | !$ACC PRESENT(f_inv, f_in) |
---|
[3690] | 619 | #endif |
---|
[1003] | 620 | DO i = nxl_y, nxr_y |
---|
| 621 | DO k = nzb_y, nzt_y |
---|
[1106] | 622 | DO j = 0, ny |
---|
| 623 | f_inv(j,k,i) = f_in(j,i,k) |
---|
[1] | 624 | ENDDO |
---|
| 625 | ENDDO |
---|
| 626 | ENDDO |
---|
[4717] | 627 | !$OMP END PARALLEL |
---|
[1] | 628 | |
---|
[1106] | 629 | ENDIF |
---|
[1] | 630 | |
---|
| 631 | END SUBROUTINE transpose_yx |
---|
| 632 | |
---|
| 633 | |
---|
[4540] | 634 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 635 | ! Description: |
---|
| 636 | ! ------------ |
---|
[4540] | 637 | !> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside |
---|
| 638 | !> on the same PE, while after transposition, all elements along x reside on the same PE. This is a |
---|
| 639 | !> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx). |
---|
| 640 | !--------------------------------------------------------------------------------------------------! |
---|
[4429] | 641 | #if defined( __parallel ) |
---|
[1682] | 642 | SUBROUTINE transpose_yxd( f_in, f_out ) |
---|
[1] | 643 | |
---|
[1682] | 644 | |
---|
[4540] | 645 | USE cpulog, & |
---|
| 646 | ONLY: cpu_log, & |
---|
| 647 | log_point_s |
---|
[1] | 648 | |
---|
[4540] | 649 | USE indices, & |
---|
| 650 | ONLY: nnx, & |
---|
| 651 | nny, & |
---|
| 652 | nnz, & |
---|
| 653 | nx, & |
---|
| 654 | nxl, & |
---|
| 655 | nxr, & |
---|
| 656 | nyn, & |
---|
| 657 | nys, & |
---|
| 658 | nz |
---|
[1320] | 659 | |
---|
| 660 | USE kinds |
---|
| 661 | |
---|
[1324] | 662 | USE pegrid |
---|
[1320] | 663 | |
---|
[4540] | 664 | USE transpose_indices, & |
---|
| 665 | ONLY: nyn_x, & |
---|
| 666 | nys_x, & |
---|
| 667 | nzb_x, & |
---|
| 668 | nzt_x |
---|
[1320] | 669 | |
---|
[1] | 670 | IMPLICIT NONE |
---|
| 671 | |
---|
[4540] | 672 | INTEGER(iwp) :: i !< |
---|
| 673 | INTEGER(iwp) :: j !< |
---|
| 674 | INTEGER(iwp) :: k !< |
---|
| 675 | INTEGER(iwp) :: l !< |
---|
| 676 | INTEGER(iwp) :: m !< |
---|
| 677 | INTEGER(iwp) :: xs !< |
---|
[1] | 678 | |
---|
[4540] | 679 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 680 | REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !< |
---|
| 681 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
| 682 | REAL(wp) :: work(nnx*nny*nnz) !< |
---|
[1] | 683 | |
---|
| 684 | ! |
---|
[4540] | 685 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[1003] | 686 | DO k = 1, nz |
---|
| 687 | DO j = nys, nyn |
---|
| 688 | DO i = nxl, nxr |
---|
[164] | 689 | f_inv(i,k,j) = f_in(k,j,i) |
---|
[1] | 690 | ENDDO |
---|
| 691 | ENDDO |
---|
| 692 | ENDDO |
---|
| 693 | |
---|
| 694 | ! |
---|
| 695 | !-- Transpose array |
---|
| 696 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) |
---|
[622] | 697 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 698 | CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, & |
---|
| 699 | work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr ) |
---|
[1] | 700 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 701 | |
---|
| 702 | ! |
---|
| 703 | !-- Reorder transposed array |
---|
| 704 | m = 0 |
---|
| 705 | DO l = 0, pdims(1) - 1 |
---|
| 706 | xs = 0 + l * nnx |
---|
[1003] | 707 | DO j = nys_x, nyn_x |
---|
| 708 | DO k = 1, nz |
---|
[1] | 709 | DO i = xs, xs + nnx - 1 |
---|
| 710 | m = m + 1 |
---|
[164] | 711 | f_out(i,j,k) = work(m) |
---|
[1] | 712 | ENDDO |
---|
| 713 | ENDDO |
---|
| 714 | ENDDO |
---|
| 715 | ENDDO |
---|
| 716 | |
---|
[4429] | 717 | END SUBROUTINE transpose_yxd |
---|
[1] | 718 | #endif |
---|
| 719 | |
---|
| 720 | |
---|
[4540] | 721 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 722 | ! Description: |
---|
| 723 | ! ------------ |
---|
[4540] | 724 | !> Resorting data for the transposition from y to z. The transposition itself is carried out in |
---|
| 725 | !> transpose_yz. |
---|
| 726 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 727 | SUBROUTINE resort_for_yz( f_in, f_inv ) |
---|
[1216] | 728 | |
---|
[1682] | 729 | |
---|
[4540] | 730 | USE indices, & |
---|
| 731 | ONLY: ny |
---|
[1216] | 732 | |
---|
[4540] | 733 | USE kinds |
---|
[1320] | 734 | |
---|
[4540] | 735 | USE transpose_indices, & |
---|
| 736 | ONLY: nxl_y, & |
---|
| 737 | nxr_y, & |
---|
| 738 | nzb_y, & |
---|
| 739 | nzt_y |
---|
[1320] | 740 | |
---|
[4540] | 741 | IMPLICIT NONE |
---|
[1216] | 742 | |
---|
[4540] | 743 | INTEGER(iwp) :: i !< |
---|
| 744 | INTEGER(iwp) :: j !< |
---|
| 745 | INTEGER(iwp) :: k !< |
---|
[1216] | 746 | |
---|
[4540] | 747 | REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
| 748 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
[1216] | 749 | |
---|
| 750 | ! |
---|
[4540] | 751 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[4717] | 752 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 753 | !$OMP DO |
---|
[3690] | 754 | #if __acc_fft_device |
---|
[4540] | 755 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 756 | !$ACC PRESENT(f_inv, f_in) |
---|
[3690] | 757 | #endif |
---|
[4540] | 758 | DO k = nzb_y, nzt_y |
---|
| 759 | DO i = nxl_y, nxr_y |
---|
| 760 | DO j = 0, ny |
---|
| 761 | f_inv(i,k,j) = f_in(j,i,k) |
---|
| 762 | ENDDO |
---|
| 763 | ENDDO |
---|
| 764 | ENDDO |
---|
[4717] | 765 | !$OMP END PARALLEL |
---|
[1216] | 766 | |
---|
| 767 | END SUBROUTINE resort_for_yz |
---|
| 768 | |
---|
| 769 | |
---|
[4540] | 770 | !--------------------------------------------------------------------------------------------------! |
---|
[1216] | 771 | ! Description: |
---|
| 772 | ! ------------ |
---|
[4540] | 773 | !> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside |
---|
| 774 | !> on the same PE, while after transposition, all elements along z reside on the same PE. |
---|
| 775 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 776 | SUBROUTINE transpose_yz( f_inv, f_out ) |
---|
[1] | 777 | |
---|
[1682] | 778 | |
---|
[4429] | 779 | #if defined( __parallel ) |
---|
[4540] | 780 | USE cpulog, & |
---|
| 781 | ONLY: cpu_log, & |
---|
| 782 | cpu_log_nowait, & |
---|
| 783 | log_point_s |
---|
[4429] | 784 | #endif |
---|
[1] | 785 | |
---|
[4540] | 786 | USE indices, & |
---|
| 787 | ONLY: ny, & |
---|
| 788 | nz |
---|
[1320] | 789 | |
---|
| 790 | USE kinds |
---|
| 791 | |
---|
[1324] | 792 | USE pegrid |
---|
[1320] | 793 | |
---|
[4540] | 794 | USE transpose_indices, & |
---|
| 795 | ONLY: nxl_y, & |
---|
| 796 | nxl_z, & |
---|
| 797 | nxr_y, & |
---|
| 798 | nxr_z, & |
---|
| 799 | nyn_z, & |
---|
| 800 | nys_z, & |
---|
| 801 | nzb_y, & |
---|
| 802 | nzt_y |
---|
[1320] | 803 | |
---|
[1] | 804 | IMPLICIT NONE |
---|
| 805 | |
---|
[4171] | 806 | INTEGER(iwp) :: i !< |
---|
| 807 | INTEGER(iwp) :: j !< |
---|
| 808 | INTEGER(iwp) :: k !< |
---|
[4540] | 809 | |
---|
[4429] | 810 | #if defined( __parallel ) |
---|
[4540] | 811 | INTEGER(iwp) :: l !< |
---|
| 812 | INTEGER(iwp) :: zs !< |
---|
[4429] | 813 | #endif |
---|
[1] | 814 | |
---|
[4540] | 815 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
| 816 | REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< |
---|
[1111] | 817 | |
---|
[4429] | 818 | #if defined( __parallel ) |
---|
[4540] | 819 | REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< |
---|
[3690] | 820 | #if __acc_fft_device |
---|
[3634] | 821 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 822 | #endif |
---|
[4429] | 823 | #endif |
---|
[1111] | 824 | |
---|
[1320] | 825 | |
---|
[1] | 826 | ! |
---|
[4540] | 827 | !-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no |
---|
| 828 | !-- transposition has to be done. |
---|
[1] | 829 | IF ( pdims(1) == 1 ) THEN |
---|
[1106] | 830 | |
---|
[4717] | 831 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 832 | !$OMP DO |
---|
[3690] | 833 | #if __acc_fft_device |
---|
[3634] | 834 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 835 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 836 | #endif |
---|
[1003] | 837 | DO j = 0, ny |
---|
| 838 | DO k = nzb_y, nzt_y |
---|
| 839 | DO i = nxl_y, nxr_y |
---|
[164] | 840 | f_out(i,j,k) = f_inv(i,k,j) |
---|
[1] | 841 | ENDDO |
---|
| 842 | ENDDO |
---|
| 843 | ENDDO |
---|
[4717] | 844 | !$OMP END PARALLEL |
---|
[1] | 845 | |
---|
[1106] | 846 | ELSE |
---|
| 847 | |
---|
| 848 | #if defined( __parallel ) |
---|
[1] | 849 | ! |
---|
[1106] | 850 | !-- Transpose array |
---|
[1318] | 851 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 852 | |
---|
| 853 | #if __acc_fft_device |
---|
[3657] | 854 | #ifndef __cuda_aware_mpi |
---|
[3634] | 855 | !$ACC UPDATE HOST(f_inv) |
---|
[3657] | 856 | #else |
---|
| 857 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 858 | #endif |
---|
[3690] | 859 | #endif |
---|
| 860 | |
---|
[1106] | 861 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 862 | CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & |
---|
| 863 | work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr ) |
---|
[3690] | 864 | |
---|
| 865 | #if __acc_fft_device |
---|
[3657] | 866 | #ifndef __cuda_aware_mpi |
---|
[3634] | 867 | !$ACC UPDATE DEVICE(work) |
---|
[3657] | 868 | #else |
---|
| 869 | !$ACC END HOST_DATA |
---|
| 870 | #endif |
---|
[3690] | 871 | #endif |
---|
| 872 | |
---|
[1106] | 873 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 874 | |
---|
| 875 | ! |
---|
[1106] | 876 | !-- Reorder transposed array |
---|
[4717] | 877 | !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) |
---|
[1106] | 878 | DO l = 0, pdims(1) - 1 |
---|
| 879 | zs = 1 + l * ( nzt_y - nzb_y + 1 ) |
---|
[3690] | 880 | #if __acc_fft_device |
---|
[3634] | 881 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 882 | !$ACC PRESENT(f_out, work) |
---|
[3690] | 883 | #endif |
---|
[4236] | 884 | !$OMP DO |
---|
[1106] | 885 | DO j = nys_z, nyn_z |
---|
| 886 | DO k = zs, zs + nzt_y - nzb_y |
---|
| 887 | DO i = nxl_z, nxr_z |
---|
[1111] | 888 | f_out(i,j,k) = work(i,k-zs+1,j,l) |
---|
[1106] | 889 | ENDDO |
---|
[1] | 890 | ENDDO |
---|
| 891 | ENDDO |
---|
[4236] | 892 | !$OMP END DO NOWAIT |
---|
[1] | 893 | ENDDO |
---|
[4717] | 894 | !$OMP END PARALLEL |
---|
[1] | 895 | #endif |
---|
| 896 | |
---|
[1106] | 897 | ENDIF |
---|
| 898 | |
---|
[1] | 899 | END SUBROUTINE transpose_yz |
---|
| 900 | |
---|
| 901 | |
---|
[4540] | 902 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 903 | ! Description: |
---|
| 904 | ! ------------ |
---|
[4540] | 905 | !> Resorting data for the transposition from z to x. The transposition itself is carried out in |
---|
| 906 | !> transpose_zx. |
---|
| 907 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 908 | SUBROUTINE resort_for_zx( f_in, f_inv ) |
---|
[1216] | 909 | |
---|
[1682] | 910 | |
---|
[4540] | 911 | USE indices, & |
---|
| 912 | ONLY: nxl, & |
---|
| 913 | nxr, & |
---|
| 914 | nyn, & |
---|
| 915 | nys, & |
---|
| 916 | nz |
---|
[1216] | 917 | |
---|
[4540] | 918 | USE kinds |
---|
[1320] | 919 | |
---|
[4540] | 920 | IMPLICIT NONE |
---|
[1216] | 921 | |
---|
[4540] | 922 | INTEGER(iwp) :: i !< |
---|
| 923 | INTEGER(iwp) :: j !< |
---|
| 924 | INTEGER(iwp) :: k !< |
---|
[1216] | 925 | |
---|
[4540] | 926 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 927 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
[1216] | 928 | |
---|
| 929 | ! |
---|
[4540] | 930 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[4717] | 931 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 932 | !$OMP DO |
---|
[3690] | 933 | #if __acc_fft_device |
---|
[4717] | 934 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 935 | !$ACC PRESENT(f_in, f_inv) |
---|
[3690] | 936 | #endif |
---|
[4540] | 937 | DO i = nxl, nxr |
---|
| 938 | DO j = nys, nyn |
---|
| 939 | DO k = 1,nz |
---|
| 940 | f_inv(j,i,k) = f_in(k,j,i) |
---|
| 941 | ENDDO |
---|
| 942 | ENDDO |
---|
| 943 | ENDDO |
---|
[4717] | 944 | !$OMP END PARALLEL |
---|
[1216] | 945 | |
---|
| 946 | END SUBROUTINE resort_for_zx |
---|
| 947 | |
---|
| 948 | |
---|
[4540] | 949 | !--------------------------------------------------------------------------------------------------! |
---|
[1216] | 950 | ! Description: |
---|
| 951 | ! ------------ |
---|
[4540] | 952 | !> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside |
---|
| 953 | !> on the same PE, while after transposition, all elements along x reside on the same PE. |
---|
| 954 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 955 | SUBROUTINE transpose_zx( f_inv, f_out ) |
---|
[1] | 956 | |
---|
[1682] | 957 | |
---|
[4429] | 958 | #if defined( __parallel ) |
---|
[4540] | 959 | USE cpulog, & |
---|
| 960 | ONLY: cpu_log, & |
---|
| 961 | cpu_log_nowait, & |
---|
| 962 | log_point_s |
---|
[1] | 963 | |
---|
[4540] | 964 | USE fft_xy, & |
---|
| 965 | ONLY: f_vec_x, & |
---|
| 966 | temperton_fft_vec |
---|
[4429] | 967 | #endif |
---|
[4366] | 968 | |
---|
[4540] | 969 | USE indices, & |
---|
| 970 | ONLY: nx, & |
---|
| 971 | nxl, & |
---|
| 972 | nxr, & |
---|
| 973 | nyn, & |
---|
| 974 | nys, & |
---|
| 975 | nz |
---|
| 976 | |
---|
[4429] | 977 | #if defined( __parallel ) |
---|
[4540] | 978 | USE indices, & |
---|
[4429] | 979 | ONLY: nnx |
---|
| 980 | #endif |
---|
[1320] | 981 | |
---|
| 982 | USE kinds |
---|
| 983 | |
---|
[1324] | 984 | USE pegrid |
---|
[1320] | 985 | |
---|
[4540] | 986 | USE transpose_indices, & |
---|
| 987 | ONLY: nyn_x, & |
---|
| 988 | nys_x, & |
---|
| 989 | nzb_x, & |
---|
| 990 | nzt_x |
---|
[1320] | 991 | |
---|
[1] | 992 | IMPLICIT NONE |
---|
| 993 | |
---|
[4171] | 994 | INTEGER(iwp) :: i !< |
---|
| 995 | INTEGER(iwp) :: j !< |
---|
| 996 | INTEGER(iwp) :: k !< |
---|
[4540] | 997 | |
---|
[4429] | 998 | #if defined( __parallel ) |
---|
[4540] | 999 | INTEGER(iwp) :: l !< |
---|
| 1000 | INTEGER(iwp) :: mm !< |
---|
| 1001 | INTEGER(iwp) :: xs !< |
---|
[4429] | 1002 | #endif |
---|
[1] | 1003 | |
---|
[4540] | 1004 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 1005 | REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< |
---|
[1111] | 1006 | |
---|
[4429] | 1007 | #if defined( __parallel ) |
---|
[4540] | 1008 | REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< |
---|
[3690] | 1009 | #if __acc_fft_device |
---|
[3634] | 1010 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 1011 | #endif |
---|
[4429] | 1012 | #endif |
---|
[1] | 1013 | |
---|
[1320] | 1014 | |
---|
[1] | 1015 | ! |
---|
[4540] | 1016 | !-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no |
---|
| 1017 | !-- transposition has to be done. |
---|
[1] | 1018 | IF ( pdims(1) == 1 ) THEN |
---|
[1106] | 1019 | |
---|
[4717] | 1020 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 1021 | !$OMP DO |
---|
[3690] | 1022 | #if __acc_fft_device |
---|
[3634] | 1023 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 1024 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 1025 | #endif |
---|
[1003] | 1026 | DO k = 1, nz |
---|
| 1027 | DO i = nxl, nxr |
---|
| 1028 | DO j = nys, nyn |
---|
[164] | 1029 | f_out(i,j,k) = f_inv(j,i,k) |
---|
[1] | 1030 | ENDDO |
---|
| 1031 | ENDDO |
---|
| 1032 | ENDDO |
---|
[4717] | 1033 | !$OMP END PARALLEL |
---|
[1] | 1034 | |
---|
[1106] | 1035 | ELSE |
---|
| 1036 | |
---|
| 1037 | #if defined( __parallel ) |
---|
[1] | 1038 | ! |
---|
[1106] | 1039 | !-- Transpose array |
---|
[1318] | 1040 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 1041 | |
---|
| 1042 | #if __acc_fft_device |
---|
[3657] | 1043 | #ifndef __cuda_aware_mpi |
---|
[3634] | 1044 | !$ACC UPDATE HOST(f_inv) |
---|
[3657] | 1045 | #else |
---|
| 1046 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 1047 | #endif |
---|
[3690] | 1048 | #endif |
---|
| 1049 | |
---|
[1106] | 1050 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 1051 | CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & |
---|
| 1052 | work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr ) |
---|
[3690] | 1053 | |
---|
| 1054 | #if __acc_fft_device |
---|
[3657] | 1055 | #ifndef __cuda_aware_mpi |
---|
[3634] | 1056 | !$ACC UPDATE DEVICE(work) |
---|
[3657] | 1057 | #else |
---|
| 1058 | !$ACC END HOST_DATA |
---|
| 1059 | #endif |
---|
[3690] | 1060 | #endif |
---|
| 1061 | |
---|
[1106] | 1062 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1] | 1063 | |
---|
| 1064 | ! |
---|
[4366] | 1065 | !-- Reorder transposed array. |
---|
[4370] | 1066 | !-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which |
---|
| 1067 | !-- saves additional data copy in fft_x. |
---|
[4366] | 1068 | IF ( temperton_fft_vec ) THEN |
---|
| 1069 | |
---|
| 1070 | DO l = 0, pdims(1) - 1 |
---|
| 1071 | xs = 0 + l * nnx |
---|
| 1072 | DO k = nzb_x, nzt_x |
---|
| 1073 | DO i = xs, xs + nnx - 1 |
---|
| 1074 | DO j = nys_x, nyn_x |
---|
| 1075 | mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) |
---|
[4370] | 1076 | f_vec_x(mm,i) = work(j,i-xs+1,k,l) |
---|
[4366] | 1077 | ENDDO |
---|
| 1078 | ENDDO |
---|
| 1079 | ENDDO |
---|
| 1080 | ENDDO |
---|
| 1081 | |
---|
| 1082 | ELSE |
---|
| 1083 | |
---|
[4717] | 1084 | !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) |
---|
[4366] | 1085 | DO l = 0, pdims(1) - 1 |
---|
| 1086 | xs = 0 + l * nnx |
---|
[3690] | 1087 | #if __acc_fft_device |
---|
[4366] | 1088 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 1089 | !$ACC PRESENT(f_out, work) |
---|
[3690] | 1090 | #endif |
---|
[4366] | 1091 | !$OMP DO |
---|
| 1092 | DO k = nzb_x, nzt_x |
---|
| 1093 | DO i = xs, xs + nnx - 1 |
---|
| 1094 | DO j = nys_x, nyn_x |
---|
| 1095 | f_out(i,j,k) = work(j,i-xs+1,k,l) |
---|
| 1096 | ENDDO |
---|
[1106] | 1097 | ENDDO |
---|
[1] | 1098 | ENDDO |
---|
[4366] | 1099 | !$OMP END DO NOWAIT |
---|
[1] | 1100 | ENDDO |
---|
[4717] | 1101 | !$OMP END PARALLEL |
---|
[1] | 1102 | |
---|
[4366] | 1103 | ENDIF |
---|
| 1104 | |
---|
[4415] | 1105 | #endif |
---|
| 1106 | |
---|
[1106] | 1107 | ENDIF |
---|
| 1108 | |
---|
[1] | 1109 | END SUBROUTINE transpose_zx |
---|
| 1110 | |
---|
| 1111 | |
---|
[4540] | 1112 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 1113 | ! Description: |
---|
| 1114 | ! ------------ |
---|
[4540] | 1115 | !> Resorting data after the transposition from z to y. The transposition itself is carried out in |
---|
| 1116 | !> transpose_zy. |
---|
| 1117 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 1118 | SUBROUTINE resort_for_zy( f_inv, f_out ) |
---|
[1216] | 1119 | |
---|
[1682] | 1120 | |
---|
[4540] | 1121 | USE indices, & |
---|
| 1122 | ONLY: ny |
---|
[1216] | 1123 | |
---|
[4540] | 1124 | USE kinds |
---|
[1320] | 1125 | |
---|
[4540] | 1126 | USE transpose_indices, & |
---|
| 1127 | ONLY: nxl_y, & |
---|
| 1128 | nxr_y, & |
---|
| 1129 | nzb_y, & |
---|
| 1130 | nzt_y |
---|
[1320] | 1131 | |
---|
[4540] | 1132 | IMPLICIT NONE |
---|
[1216] | 1133 | |
---|
[4540] | 1134 | INTEGER(iwp) :: i !< |
---|
| 1135 | INTEGER(iwp) :: j !< |
---|
| 1136 | INTEGER(iwp) :: k !< |
---|
[1216] | 1137 | |
---|
[4540] | 1138 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
| 1139 | REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< |
---|
[1216] | 1140 | |
---|
| 1141 | ! |
---|
[4540] | 1142 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[4717] | 1143 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 1144 | !$OMP DO |
---|
[3690] | 1145 | #if __acc_fft_device |
---|
[3634] | 1146 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 1147 | !$ACC PRESENT(f_out, f_inv) |
---|
[3690] | 1148 | #endif |
---|
[4540] | 1149 | DO k = nzb_y, nzt_y |
---|
| 1150 | DO i = nxl_y, nxr_y |
---|
| 1151 | DO j = 0, ny |
---|
| 1152 | f_out(j,i,k) = f_inv(i,k,j) |
---|
| 1153 | ENDDO |
---|
| 1154 | ENDDO |
---|
| 1155 | ENDDO |
---|
[4717] | 1156 | !$OMP END PARALLEL |
---|
[1216] | 1157 | |
---|
| 1158 | END SUBROUTINE resort_for_zy |
---|
| 1159 | |
---|
| 1160 | |
---|
[4540] | 1161 | !--------------------------------------------------------------------------------------------------! |
---|
[3241] | 1162 | ! Description:cpu_log_nowait |
---|
[1216] | 1163 | ! ------------ |
---|
[4540] | 1164 | !> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside |
---|
| 1165 | !> on the same PE, while after transposition, all elements along y reside on the same PE. |
---|
| 1166 | !--------------------------------------------------------------------------------------------------! |
---|
[1682] | 1167 | SUBROUTINE transpose_zy( f_in, f_inv ) |
---|
[1] | 1168 | |
---|
[1682] | 1169 | |
---|
[4429] | 1170 | #if defined( __parallel ) |
---|
[4540] | 1171 | USE cpulog, & |
---|
| 1172 | ONLY: cpu_log, & |
---|
| 1173 | cpu_log_nowait, & |
---|
| 1174 | log_point_s |
---|
[4429] | 1175 | #endif |
---|
[1] | 1176 | |
---|
[4540] | 1177 | USE indices, & |
---|
| 1178 | ONLY: ny, & |
---|
| 1179 | nz |
---|
[1320] | 1180 | |
---|
| 1181 | USE kinds |
---|
| 1182 | |
---|
[1324] | 1183 | USE pegrid |
---|
[1320] | 1184 | |
---|
[4540] | 1185 | USE transpose_indices, & |
---|
| 1186 | ONLY: nxl_y, & |
---|
| 1187 | nxl_z, & |
---|
| 1188 | nxr_y, & |
---|
| 1189 | nxr_z, & |
---|
| 1190 | nyn_z, & |
---|
| 1191 | nys_z, & |
---|
| 1192 | nzb_y, & |
---|
| 1193 | nzt_y |
---|
[1320] | 1194 | |
---|
[1] | 1195 | IMPLICIT NONE |
---|
| 1196 | |
---|
[4171] | 1197 | INTEGER(iwp) :: i !< |
---|
| 1198 | INTEGER(iwp) :: j !< |
---|
| 1199 | INTEGER(iwp) :: k !< |
---|
[4540] | 1200 | |
---|
[4429] | 1201 | #if defined( __parallel ) |
---|
[4540] | 1202 | INTEGER(iwp) :: l !< |
---|
| 1203 | INTEGER(iwp) :: zs !< |
---|
[4429] | 1204 | #endif |
---|
[1] | 1205 | |
---|
[4540] | 1206 | REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< |
---|
| 1207 | REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< |
---|
[1111] | 1208 | |
---|
[4429] | 1209 | #if defined( __parallel ) |
---|
[4540] | 1210 | REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< |
---|
[3690] | 1211 | #if __acc_fft_device |
---|
[3634] | 1212 | !$ACC DECLARE CREATE(work) |
---|
[3690] | 1213 | #endif |
---|
[4429] | 1214 | #endif |
---|
[1111] | 1215 | |
---|
[1] | 1216 | ! |
---|
[4540] | 1217 | !-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and |
---|
| 1218 | !-- therefore no transposition has to be done. |
---|
[1] | 1219 | IF ( pdims(1) /= 1 ) THEN |
---|
[1106] | 1220 | |
---|
| 1221 | #if defined( __parallel ) |
---|
[1] | 1222 | ! |
---|
| 1223 | !-- Reorder input array for transposition |
---|
[4717] | 1224 | !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) |
---|
[1] | 1225 | DO l = 0, pdims(1) - 1 |
---|
[1003] | 1226 | zs = 1 + l * ( nzt_y - nzb_y + 1 ) |
---|
[3690] | 1227 | #if __acc_fft_device |
---|
[3634] | 1228 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 1229 | !$ACC PRESENT(work, f_in) |
---|
[3690] | 1230 | #endif |
---|
[4236] | 1231 | !$OMP DO |
---|
[1003] | 1232 | DO j = nys_z, nyn_z |
---|
| 1233 | DO k = zs, zs + nzt_y - nzb_y |
---|
| 1234 | DO i = nxl_z, nxr_z |
---|
[1111] | 1235 | work(i,k-zs+1,j,l) = f_in(i,j,k) |
---|
[1] | 1236 | ENDDO |
---|
| 1237 | ENDDO |
---|
| 1238 | ENDDO |
---|
[4236] | 1239 | !$OMP END DO NOWAIT |
---|
[1] | 1240 | ENDDO |
---|
[4717] | 1241 | !$OMP END PARALLEL |
---|
[1] | 1242 | |
---|
| 1243 | ! |
---|
| 1244 | !-- Transpose array |
---|
[1318] | 1245 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) |
---|
[3690] | 1246 | |
---|
| 1247 | #if __acc_fft_device |
---|
[3657] | 1248 | #ifndef __cuda_aware_mpi |
---|
[3634] | 1249 | !$ACC UPDATE HOST(work) |
---|
[3657] | 1250 | #else |
---|
| 1251 | !$ACC HOST_DATA USE_DEVICE(work, f_inv) |
---|
| 1252 | #endif |
---|
[3690] | 1253 | #endif |
---|
| 1254 | |
---|
[622] | 1255 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 1256 | CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & |
---|
| 1257 | f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr ) |
---|
[3690] | 1258 | |
---|
| 1259 | #if __acc_fft_device |
---|
[3657] | 1260 | #ifndef __cuda_aware_mpi |
---|
[3634] | 1261 | !$ACC UPDATE DEVICE(f_inv) |
---|
[3657] | 1262 | #else |
---|
| 1263 | !$ACC END HOST_DATA |
---|
| 1264 | #endif |
---|
[3690] | 1265 | #endif |
---|
| 1266 | |
---|
[1] | 1267 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
[1106] | 1268 | #endif |
---|
[1] | 1269 | |
---|
| 1270 | ELSE |
---|
| 1271 | ! |
---|
[1106] | 1272 | !-- Reorder the array in the same way like ALLTOALL did it |
---|
[4717] | 1273 | !$OMP PARALLEL PRIVATE ( i, j, k ) |
---|
| 1274 | !$OMP DO |
---|
[3690] | 1275 | #if __acc_fft_device |
---|
[3634] | 1276 | !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & |
---|
| 1277 | !$ACC PRESENT(f_inv, f_in) |
---|
[3690] | 1278 | #endif |
---|
[1003] | 1279 | DO k = nzb_y, nzt_y |
---|
| 1280 | DO j = 0, ny |
---|
| 1281 | DO i = nxl_y, nxr_y |
---|
[164] | 1282 | f_inv(i,k,j) = f_in(i,j,k) |
---|
| 1283 | ENDDO |
---|
| 1284 | ENDDO |
---|
| 1285 | ENDDO |
---|
[4717] | 1286 | !$OMP END PARALLEL |
---|
[1106] | 1287 | |
---|
| 1288 | ENDIF |
---|
| 1289 | |
---|
[1] | 1290 | END SUBROUTINE transpose_zy |
---|
| 1291 | |
---|
| 1292 | |
---|
[4540] | 1293 | !--------------------------------------------------------------------------------------------------! |
---|
[1] | 1294 | ! Description: |
---|
| 1295 | ! ------------ |
---|
[4540] | 1296 | !> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside |
---|
| 1297 | !> on the same PE, while after transposition, all elements along y reside on the same PE. This is a |
---|
| 1298 | !> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy). |
---|
| 1299 | !--------------------------------------------------------------------------------------------------! |
---|
[4429] | 1300 | #if defined( __parallel ) |
---|
[1682] | 1301 | SUBROUTINE transpose_zyd( f_in, f_out ) |
---|
[1] | 1302 | |
---|
[1682] | 1303 | |
---|
[4540] | 1304 | USE cpulog, & |
---|
| 1305 | ONLY: cpu_log, & |
---|
| 1306 | log_point_s |
---|
[1] | 1307 | |
---|
[4540] | 1308 | USE indices, & |
---|
| 1309 | ONLY: nnx, & |
---|
| 1310 | nny, & |
---|
| 1311 | nnz, & |
---|
| 1312 | nxl, & |
---|
| 1313 | nxr, & |
---|
| 1314 | nyn, & |
---|
| 1315 | nys, & |
---|
| 1316 | ny, & |
---|
| 1317 | nz |
---|
[1320] | 1318 | |
---|
| 1319 | USE kinds |
---|
| 1320 | |
---|
[1324] | 1321 | USE pegrid |
---|
[1320] | 1322 | |
---|
[4540] | 1323 | USE transpose_indices, & |
---|
| 1324 | ONLY: nxl_yd, & |
---|
| 1325 | nxr_yd, & |
---|
| 1326 | nzb_yd, & |
---|
| 1327 | nzt_yd |
---|
[1320] | 1328 | |
---|
[1] | 1329 | IMPLICIT NONE |
---|
| 1330 | |
---|
[4540] | 1331 | INTEGER(iwp) :: i !< |
---|
| 1332 | INTEGER(iwp) :: j !< |
---|
| 1333 | INTEGER(iwp) :: k !< |
---|
| 1334 | INTEGER(iwp) :: l !< |
---|
| 1335 | INTEGER(iwp) :: m !< |
---|
| 1336 | INTEGER(iwp) :: ys !< |
---|
[1] | 1337 | |
---|
[4540] | 1338 | REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< |
---|
| 1339 | REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< |
---|
| 1340 | REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !< |
---|
| 1341 | REAL(wp) :: work(nnx*nny*nnz) !< |
---|
[1320] | 1342 | |
---|
[1] | 1343 | ! |
---|
[4540] | 1344 | !-- Rearrange indices of input array in order to make data to be send by MPI contiguous. |
---|
[1003] | 1345 | DO i = nxl, nxr |
---|
| 1346 | DO j = nys, nyn |
---|
| 1347 | DO k = 1, nz |
---|
[164] | 1348 | f_inv(j,i,k) = f_in(k,j,i) |
---|
[1] | 1349 | ENDDO |
---|
| 1350 | ENDDO |
---|
| 1351 | ENDDO |
---|
| 1352 | |
---|
| 1353 | ! |
---|
[4540] | 1354 | !-- Move data to different array, because memory location of work1 is needed further below |
---|
| 1355 | !-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is |
---|
| 1356 | !-- necessary and no transposition has to be done. |
---|
[1] | 1357 | IF ( pdims(2) == 1 ) THEN |
---|
[1003] | 1358 | DO k = 1, nz |
---|
| 1359 | DO i = nxl, nxr |
---|
| 1360 | DO j = nys, nyn |
---|
[164] | 1361 | f_out(j,i,k) = f_inv(j,i,k) |
---|
[1] | 1362 | ENDDO |
---|
| 1363 | ENDDO |
---|
| 1364 | ENDDO |
---|
| 1365 | RETURN |
---|
| 1366 | ENDIF |
---|
| 1367 | |
---|
| 1368 | ! |
---|
| 1369 | !-- Transpose array |
---|
| 1370 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) |
---|
[622] | 1371 | IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) |
---|
[4540] | 1372 | CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, & |
---|
| 1373 | work(1), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr ) |
---|
[1] | 1374 | CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) |
---|
| 1375 | |
---|
| 1376 | ! |
---|
| 1377 | !-- Reorder transposed array |
---|
| 1378 | m = 0 |
---|
| 1379 | DO l = 0, pdims(2) - 1 |
---|
| 1380 | ys = 0 + l * nny |
---|
[1003] | 1381 | DO k = nzb_yd, nzt_yd |
---|
| 1382 | DO i = nxl_yd, nxr_yd |
---|
[1] | 1383 | DO j = ys, ys + nny - 1 |
---|
| 1384 | m = m + 1 |
---|
[164] | 1385 | f_out(j,i,k) = work(m) |
---|
[1] | 1386 | ENDDO |
---|
| 1387 | ENDDO |
---|
| 1388 | ENDDO |
---|
| 1389 | ENDDO |
---|
| 1390 | |
---|
[4429] | 1391 | END SUBROUTINE transpose_zyd |
---|
[1] | 1392 | #endif |
---|