Changeset 4649 for palm/trunk/SOURCE/poisfft_mod.f90
- Timestamp:
- Aug 25, 2020 12:11:17 PM (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/poisfft_mod.f90
r4429 r4649 1 1 !> @file poisfft_mod.f90 2 !------------------------------------------------------------------------------ !2 !--------------------------------------------------------------------------------------------------! 3 3 ! This file is part of the PALM model system. 4 4 ! 5 ! PALM is free software: you can redistribute it and/or modify it under the 6 ! terms of the GNU General Public License as published by the Free Software 7 ! Foundation, either version 3 of the License, or (at your option) any later 8 ! version. 9 ! 10 ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY 11 ! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 12 ! A PARTICULAR PURPOSE. See the GNU General Public License for more details. 13 ! 14 ! You should have received a copy of the GNU General Public License along with 15 ! PALM. If not, see <http://www.gnu.org/licenses/>. 5 ! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General 6 ! Public License as published by the Free Software Foundation, either version 3 of the License, or 7 ! (at your option) any later version. 8 ! 9 ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the 10 ! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General 11 ! Public License for more details. 12 ! 13 ! You should have received a copy of the GNU General Public License along with PALM. If not, see 14 ! <http://www.gnu.org/licenses/>. 16 15 ! 17 16 ! Copyright 1997-2020 Leibniz Universitaet Hannover 18 !------------------------------------------------------------------------------! 17 !--------------------------------------------------------------------------------------------------! 18 ! 19 19 ! 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 23 ! 22 ! 23 ! 24 24 ! Former revisions: 25 25 ! ----------------- 26 26 ! $Id$ 27 ! File re-formatted to follow the PALM coding standard 28 ! 29 ! 30 ! 4429 2020-02-27 15:24:30Z raasch 27 31 ! Statements added to avoid compile errors due to unused dummy arguments in serial mode 28 ! 32 ! 29 33 ! 4366 2020-01-09 08:12:43Z raasch 30 ! modification concerning NEC vectorizatio31 ! 34 ! Modification concerning NEC vectorizatio 35 ! 32 36 ! 4360 2020-01-07 11:25:50Z suehring 33 37 ! Corrected "Former revisions" section 34 ! 38 ! 35 39 ! 3690 2019-01-22 22:56:42Z knoop 36 40 ! OpenACC port for SPEC … … 40 44 ! 41 45 ! 46 !--------------------------------------------------------------------------------------------------! 42 47 ! Description: 43 48 ! ------------ … … 46 51 !> 47 52 !> Input: 48 !> real ar contains (nnz,nny,nnx) elements of the velocity divergence, 49 !> starting from (1,nys,nxl) 53 !> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl) 50 54 !> 51 55 !> Output: 52 !> real 53 !------------------------------------------------------------------------------ !56 !> real ar contains the solution for perturbation pressure p 57 !--------------------------------------------------------------------------------------------------! 54 58 MODULE poisfft_mod 55 56 57 USE fft_xy, & 58 ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, & 59 60 61 USE fft_xy, & 62 ONLY: fft_init, & 63 fft_y, & 64 fft_y_1d, & 65 fft_y_m, & 66 fft_x, & 67 fft_x_1d, & 68 fft_x_m, & 59 69 temperton_fft_vec 60 70 61 USE indices, & 62 ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz 63 64 USE transpose_indices, & 65 ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, & 66 nzb_y, nzt_x, nzt_y 67 68 USE tridia_solver, & 69 ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap 71 USE indices, & 72 ONLY: nnx, & 73 nny, & 74 nx, & 75 nxl, & 76 nxr, & 77 ny, & 78 nys, & 79 nyn, & 80 nz 81 82 USE transpose_indices, & 83 ONLY: nxl_y, & 84 nxl_z, & 85 nxr_y, & 86 nxr_z, & 87 nys_x, & 88 nys_z, & 89 nyn_x, & 90 nyn_z, & 91 nzb_x, & 92 nzb_y, & 93 nzt_x, & 94 nzt_y 95 96 USE tridia_solver, & 97 ONLY: tridia_1dd, & 98 tridia_init, & 99 tridia_substi, & 100 tridia_substi_overlap 70 101 71 102 IMPLICIT NONE 72 103 73 LOGICAL, SAVE :: poisfft_initialized = .FALSE. 104 LOGICAL, SAVE :: poisfft_initialized = .FALSE. !< 74 105 75 106 PRIVATE … … 88 119 CONTAINS 89 120 90 !------------------------------------------------------------------------------ !121 !--------------------------------------------------------------------------------------------------! 91 122 ! Description: 92 123 ! ------------ 93 124 !> Setup coefficients for FFT and the tridiagonal solver 94 !------------------------------------------------------------------------------ !95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 !------------------------------------------------------------------------------ !125 !--------------------------------------------------------------------------------------------------! 126 SUBROUTINE poisfft_init 127 128 IMPLICIT NONE 129 130 131 CALL fft_init 132 133 CALL tridia_init 134 135 poisfft_initialized = .TRUE. 136 137 END SUBROUTINE poisfft_init 138 139 140 141 !--------------------------------------------------------------------------------------------------! 111 142 ! Description: 112 143 ! ------------ 113 144 !> Two-dimensional Fourier Transformation in x- and y-direction. 114 !------------------------------------------------------------------------------! 115 SUBROUTINE poisfft( ar ) 116 117 USE control_parameters, & 118 ONLY: transpose_compute_overlap 119 120 USE cpulog, & 121 ONLY: cpu_log, cpu_log_nowait, log_point_s 122 123 USE kinds 124 125 USE pegrid 126 127 IMPLICIT NONE 128 129 INTEGER(iwp) :: ii !< 130 INTEGER(iwp) :: iind !< 131 INTEGER(iwp) :: inew !< 132 INTEGER(iwp) :: jj !< 133 INTEGER(iwp) :: jind !< 134 INTEGER(iwp) :: jnew !< 135 INTEGER(iwp) :: ki !< 136 INTEGER(iwp) :: kk !< 137 INTEGER(iwp) :: knew !< 138 INTEGER(iwp) :: n !< 139 INTEGER(iwp) :: nblk !< 140 INTEGER(iwp) :: nnx_y !< 141 INTEGER(iwp) :: nny_z !< 142 INTEGER(iwp) :: nnz_x !< 143 INTEGER(iwp) :: nxl_y_bound !< 144 INTEGER(iwp) :: nxr_y_bound !< 145 146 INTEGER(iwp), DIMENSION(4) :: isave !< 147 148 REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !< 149 REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !< 145 !--------------------------------------------------------------------------------------------------! 146 SUBROUTINE poisfft( ar ) 147 148 USE control_parameters, & 149 ONLY: transpose_compute_overlap 150 151 USE cpulog, & 152 ONLY: cpu_log, & 153 cpu_log_nowait, & 154 log_point_s 155 156 USE kinds 157 158 USE pegrid 159 160 IMPLICIT NONE 161 162 INTEGER(iwp) :: ii !< 163 INTEGER(iwp) :: iind !< 164 INTEGER(iwp) :: inew !< 165 INTEGER(iwp) :: jj !< 166 INTEGER(iwp) :: jind !< 167 INTEGER(iwp) :: jnew !< 168 INTEGER(iwp) :: ki !< 169 INTEGER(iwp) :: kk !< 170 INTEGER(iwp) :: knew !< 171 INTEGER(iwp) :: n !< 172 INTEGER(iwp) :: nblk !< 173 INTEGER(iwp) :: nnx_y !< 174 INTEGER(iwp) :: nny_z !< 175 INTEGER(iwp) :: nnz_x !< 176 INTEGER(iwp) :: nxl_y_bound !< 177 INTEGER(iwp) :: nxr_y_bound !< 178 179 INTEGER(iwp), DIMENSION(4) :: isave !< 180 181 REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !< 182 REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !< 150 183 151 184 #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) ) 152 185 #if __acc_fft_device 153 186 !$ACC DECLARE CREATE(ar_inv) 154 187 #endif 155 188 156 157 158 159 160 161 162 163 164 165 189 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !< 190 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !< 191 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !< 192 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !< 193 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !< 194 195 196 CALL cpu_log( log_point_s(3), 'poisfft', 'start' ) 197 198 IF ( .NOT. poisfft_initialized ) CALL poisfft_init 166 199 167 200 #if !__acc_fft_device 168 201 !$ACC UPDATE HOST(ar) 169 202 #endif 170 203 171 204 #ifndef _OPENACC 172 205 ! 173 !-- 174 175 176 ! 177 !-- 178 !-- 179 180 181 ! 182 !-- 183 184 185 ! 186 !-- 187 188 189 190 191 ! 192 !-- 193 !-- 194 195 196 ! 197 !-- 198 199 200 ! 201 !-- 202 203 204 206 !-- Two-dimensional Fourier Transformation in x- and y-direction. 207 IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN 208 209 ! 210 !-- 1d-domain-decomposition along x: 211 !-- FFT along y and transposition y --> x 212 CALL ffty_tr_yx( ar, ar ) 213 214 ! 215 !-- FFT along x, solving the tridiagonal system and backward FFT 216 CALL fftx_tri_fftx( ar ) 217 218 ! 219 !-- Transposition x --> y and backward FFT along y 220 CALL tr_xy_ffty( ar, ar ) 221 222 ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN 223 224 ! 225 !-- 1d-domain-decomposition along y: 226 !-- FFT along x and transposition x --> y 227 CALL fftx_tr_xy( ar, ar ) 228 229 ! 230 !-- FFT along y, solving the tridiagonal system and backward FFT 231 CALL ffty_tri_ffty( ar ) 232 233 ! 234 !-- Transposition y --> x and backward FFT along x 235 CALL tr_yx_fftx( ar, ar ) 236 237 ELSEIF ( .NOT. transpose_compute_overlap ) THEN 205 238 #endif 206 239 207 240 ! 208 !-- 2d-domain-decomposition or no decomposition (1 PE run) 209 !-- Transposition z --> x 210 CALL cpu_log( log_point_s(5), 'transpo forward', 'start' ) 211 CALL resort_for_zx( ar, ar_inv ) 212 CALL transpose_zx( ar_inv, ar ) 241 !-- 2d-domain-decomposition or no decomposition (1 PE run) 242 !-- Transposition z --> x 243 CALL cpu_log( log_point_s(5), 'transpo forward', 'start' ) 244 CALL resort_for_zx( ar, ar_inv ) 245 CALL transpose_zx( ar_inv, ar ) 246 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 247 248 CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) 249 IF ( temperton_fft_vec ) THEN 250 ! 251 !-- Vector version outputs a transformed array ar_inv that does not require resorting 252 !-- (which is done for ar further below) 253 CALL fft_x( ar, 'forward', ar_inv=ar_inv) 254 ELSE 255 CALL fft_x( ar, 'forward') 256 ENDIF 257 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 258 259 ! 260 !-- Transposition x --> y 261 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 262 IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv ) 263 CALL transpose_xy( ar_inv, ar ) 264 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 265 266 CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) 267 IF ( temperton_fft_vec ) THEN 268 ! 269 !-- Input array ar_inv from fft_x can be directly used here. 270 !-- The output (also in array ar_inv) does not require resorting below. 271 CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 272 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 273 ELSE 274 CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 275 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 276 ENDIF 277 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 278 279 ! 280 !-- Transposition y --> z 281 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 282 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv ) 283 CALL transpose_yz( ar_inv, ar ) 284 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) 285 286 ! 287 !-- Solve the tridiagonal equation system along z 288 CALL cpu_log( log_point_s(6), 'tridia', 'start' ) 289 CALL tridia_substi( ar ) 290 CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) 291 292 ! 293 !-- Inverse Fourier Transformation 294 !-- Transposition z --> y 295 CALL cpu_log( log_point_s(8), 'transpo invers', 'start' ) 296 CALL transpose_zy( ar, ar_inv ) 297 ! 298 !-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a 299 !-- resorting) 300 IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar ) 301 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 302 303 CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) 304 IF ( temperton_fft_vec ) THEN 305 ! 306 !-- Output array ar_inv can be used as input to the below fft_x routine without resorting 307 CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 308 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 309 ELSE 310 CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 311 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 312 ENDIF 313 314 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) 315 316 ! 317 !-- Transposition y --> x 318 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) 319 CALL transpose_yx( ar, ar_inv ) 320 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar ) 321 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 322 323 CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) 324 IF ( temperton_fft_vec ) THEN 325 CALL fft_x( ar, 'backward', ar_inv=ar_inv ) 326 ELSE 327 CALL fft_x( ar, 'backward' ) 328 ENDIF 329 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) 330 331 ! 332 !-- Transposition x --> z 333 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) 334 CALL transpose_xz( ar, ar_inv ) 335 CALL resort_for_xz( ar_inv, ar ) 336 CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) 337 338 #ifndef _OPENACC 339 ELSE 340 341 ! 342 !-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft 343 !-- cputime logging must not use barriers, which would prevent overlapping 344 ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 345 f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 346 ! 347 !-- Transposition z --> x + subsequent fft along x 348 ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) ) 349 CALL resort_for_zx( ar, f_inv ) 350 ! 351 !-- Save original indices and gridpoint counter 352 isave(1) = nz 353 isave(2) = nzb_x 354 isave(3) = nzt_x 355 isave(4) = sendrecvcount_zx 356 ! 357 !-- Set new indices for transformation 358 nblk = nz / pdims(1) 359 nz = pdims(1) 360 nnz_x = 1 361 nzb_x = 1 + myidx * nnz_x 362 nzt_x = ( myidx + 1 ) * nnz_x 363 sendrecvcount_zx = nnx * nny * nnz_x 364 365 ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 366 ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) ) 367 368 DO kk = 1, nblk 369 370 IF ( kk == 1 ) THEN 371 CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait ) 372 ELSE 373 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 374 ENDIF 375 376 DO knew = 1, nz 377 ki = kk + nblk * ( knew - 1 ) 378 f_in(:,:,knew) = f_inv(:,:,ki) 379 ENDDO 380 381 CALL transpose_zx( f_in, ar1(:,:,:)) 213 382 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 214 383 215 CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) 216 IF ( temperton_fft_vec ) THEN 217 ! 218 !-- Vector version outputs a transformed array ar_inv that does not require resorting 219 !-- (which is done for ar further below) 220 CALL fft_x( ar, 'forward', ar_inv=ar_inv) 384 IF ( kk == 1 ) THEN 385 CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait ) 221 386 ELSE 222 CALL fft_x( ar, 'forward')387 CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait ) 223 388 ENDIF 389 390 n = isave(2) + kk - 1 391 CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) ) 224 392 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 225 393 226 ! 227 !-- Transposition x --> y 228 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 229 IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv ) 230 CALL transpose_xy( ar_inv, ar ) 394 ENDDO 395 ! 396 !-- Restore original indices/counters 397 nz = isave(1) 398 nzb_x = isave(2) 399 nzt_x = isave(3) 400 sendrecvcount_zx = isave(4) 401 402 DEALLOCATE( ar1, f_in, f_inv ) 403 404 ! 405 !-- Transposition x --> y + subsequent fft along y 406 ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 407 CALL resort_for_xy( f_out_z, f_inv ) 408 ! 409 !-- Save original indices and gridpoint counter 410 isave(1) = nx 411 isave(2) = nxl_y 412 isave(3) = nxr_y 413 isave(4) = sendrecvcount_xy 414 ! 415 !-- Set new indices for transformation 416 nblk = ( ( nx+1 ) / pdims(2) ) - 1 417 nx = pdims(2) 418 nnx_y = 1 419 nxl_y = myidy * nnx_y 420 nxr_y = ( myidy + 1 ) * nnx_y - 1 421 sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 ) 422 423 ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ) 424 ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 425 426 DO ii = 0, nblk 427 428 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 429 430 DO inew = 0, nx-1 431 iind = ii + ( nblk + 1 ) * inew 432 f_in(:,:,inew) = f_inv(:,:,iind) 433 ENDDO 434 435 CALL transpose_xy( f_in, ar1(:,:,:) ) 436 231 437 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 232 438 233 CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) 234 IF ( temperton_fft_vec ) THEN 235 ! 236 !-- Input array ar_inv from fft_x can be directly used here. 237 !-- The output (also in array ar_inv) does not require resorting below. 238 CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 239 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 439 IF ( ii == 1 ) THEN 440 CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait ) 240 441 ELSE 241 CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 242 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 442 CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait ) 243 443 ENDIF 444 445 nxl_y_bound = isave(2) 446 nxr_y_bound = isave(3) 447 n = isave(2) + ii 448 CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, & 449 nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n ) 450 244 451 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 245 452 246 ! 453 ENDDO 454 ! 455 !-- Restore original indices/counters 456 nx = isave(1) 457 nxl_y = isave(2) 458 nxr_y = isave(3) 459 sendrecvcount_xy = isave(4) 460 461 DEALLOCATE( ar1, f_in, f_inv ) 462 463 ! 464 !-- Transposition y --> z + subsequent tridia + resort for z --> y 465 ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) ) 466 CALL resort_for_yz( f_out_y, f_inv ) 467 ! 468 !-- Save original indices and gridpoint counter 469 isave(1) = ny 470 isave(2) = nys_z 471 isave(3) = nyn_z 472 isave(4) = sendrecvcount_yz 473 ! 474 !-- Set new indices for transformation 475 nblk = ( ( ny+1 ) / pdims(1) ) - 1 476 ny = pdims(1) 477 nny_z = 1 478 nys_z = myidx * nny_z 479 nyn_z = ( myidx + 1 ) * nny_z - 1 480 sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 ) 481 482 ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) ) 483 ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) ) 484 485 DO jj = 0, nblk 486 ! 487 !-- Forward Fourier Transformation 247 488 !-- Transposition y --> z 248 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 249 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv ) 250 CALL transpose_yz( ar_inv, ar ) 251 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) 489 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 490 491 DO jnew = 0, ny-1 492 jind = jj + ( nblk + 1 ) * jnew 493 f_in(:,:,jnew) = f_inv(:,:,jind) 494 ENDDO 495 496 CALL transpose_yz( f_in, ar1(:,:,:) ) 497 498 IF ( jj == nblk ) THEN 499 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) 500 ELSE 501 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 502 ENDIF 252 503 253 504 ! 254 505 !-- Solve the tridiagonal equation system along z 255 CALL cpu_log( log_point_s(6), 'tridia', 'start' ) 256 CALL tridia_substi( ar ) 506 CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait ) 507 508 n = isave(2) + jj 509 CALL tridia_substi_overlap( ar1(:,:,:), n ) 510 257 511 CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) 258 512 … … 260 514 !-- Inverse Fourier Transformation 261 515 !-- Transposition z --> y 262 CALL cpu_log( log_point_s(8), 'transpo invers', 'start' ) 263 CALL transpose_zy( ar, ar_inv ) 264 ! 265 !-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a 266 !-- resorting) 267 IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar ) 516 !-- Only one thread should call MPI routines, therefore forward and backward tranpose are in 517 !-- the same section 518 IF ( jj == 0 ) THEN 519 CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait ) 520 ELSE 521 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 522 ENDIF 523 524 CALL transpose_zy( ar1(:,:,:), f_in ) 525 526 DO jnew = 0, ny-1 527 jind = jj + ( nblk + 1 ) * jnew 528 f_inv(:,:,jind) = f_in(:,:,jnew) 529 ENDDO 530 268 531 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 269 532 270 CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) 271 IF ( temperton_fft_vec ) THEN 272 ! 273 !-- Output array ar_inv can be used as input to the below fft_x routine without resorting 274 CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,& 275 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 533 ENDDO 534 ! 535 !-- Restore original indices/counters 536 ny = isave(1) 537 nys_z = isave(2) 538 nyn_z = isave(3) 539 sendrecvcount_yz = isave(4) 540 541 CALL resort_for_zy( f_inv, f_out_y ) 542 543 DEALLOCATE( ar1, f_in, f_inv ) 544 545 ! 546 !-- fft along y backward + subsequent transposition y --> x 547 ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 548 ! 549 !-- Save original indices and gridpoint counter 550 isave(1) = nx 551 isave(2) = nxl_y 552 isave(3) = nxr_y 553 isave(4) = sendrecvcount_xy 554 ! 555 !-- Set new indices for transformation 556 nblk = ( ( nx+1 ) / pdims(2) ) - 1 557 nx = pdims(2) 558 nnx_y = 1 559 nxl_y = myidy * nnx_y 560 nxr_y = ( myidy + 1 ) * nnx_y - 1 561 sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 ) 562 563 ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ) 564 ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 565 566 DO ii = 0, nblk 567 568 CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait ) 569 570 n = isave(2) + ii 571 nxl_y_bound = isave(2) 572 nxr_y_bound = isave(3) 573 574 CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, & 575 nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n ) 576 577 IF ( ii == nblk ) THEN 578 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) 276 579 ELSE 277 CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 278 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 580 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 279 581 ENDIF 280 582 281 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) 282 283 ! 284 !-- Transposition y --> x 285 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) 286 CALL transpose_yx( ar, ar_inv ) 287 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar ) 583 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 584 585 CALL transpose_yx( ar1(:,:,:), f_in ) 586 587 DO inew = 0, nx-1 588 iind = ii + (nblk+1) * inew 589 f_inv(:,:,iind) = f_in(:,:,inew) 590 ENDDO 591 288 592 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 289 593 290 CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) 291 IF ( temperton_fft_vec ) THEN 292 CALL fft_x( ar, 'backward', ar_inv=ar_inv ) 594 ENDDO 595 ! 596 !-- Restore original indices/counters 597 nx = isave(1) 598 nxl_y = isave(2) 599 nxr_y = isave(3) 600 sendrecvcount_xy = isave(4) 601 602 CALL resort_for_yx( f_inv, f_out_z ) 603 604 DEALLOCATE( ar1, f_in, f_inv ) 605 606 ! 607 !-- fft along x backward + subsequent final transposition x --> z 608 ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) ) 609 ! 610 !-- Save original indices and gridpoint counter 611 isave(1) = nz 612 isave(2) = nzb_x 613 isave(3) = nzt_x 614 isave(4) = sendrecvcount_zx 615 ! 616 !-- Set new indices for transformation 617 nblk = nz / pdims(1) 618 nz = pdims(1) 619 nnz_x = 1 620 nzb_x = 1 + myidx * nnz_x 621 nzt_x = ( myidx + 1 ) * nnz_x 622 sendrecvcount_zx = nnx * nny * nnz_x 623 624 ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 625 ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) ) 626 627 DO kk = 1, nblk 628 629 CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait ) 630 631 n = isave(2) + kk - 1 632 CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) ) 633 634 IF ( kk == nblk ) THEN 635 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) 293 636 ELSE 294 CALL fft_x( ar, 'backward' )637 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 295 638 ENDIF 296 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) 297 298 ! 299 !-- Transposition x --> z 300 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) 301 CALL transpose_xz( ar, ar_inv ) 302 CALL resort_for_xz( ar_inv, ar ) 303 CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) 304 305 #ifndef _OPENACC 306 ELSE 307 308 ! 309 !-- 2d-domain-decomposition or no decomposition (1 PE run) with 310 !-- overlapping transposition / fft 311 !-- cputime logging must not use barriers, which would prevent overlapping 312 ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 313 f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 314 ! 315 !-- Transposition z --> x + subsequent fft along x 316 ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) ) 317 CALL resort_for_zx( ar, f_inv ) 318 ! 319 !-- Save original indices and gridpoint counter 320 isave(1) = nz 321 isave(2) = nzb_x 322 isave(3) = nzt_x 323 isave(4) = sendrecvcount_zx 324 ! 325 !-- Set new indices for transformation 326 nblk = nz / pdims(1) 327 nz = pdims(1) 328 nnz_x = 1 329 nzb_x = 1 + myidx * nnz_x 330 nzt_x = ( myidx + 1 ) * nnz_x 331 sendrecvcount_zx = nnx * nny * nnz_x 332 333 ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 334 ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) ) 335 336 DO kk = 1, nblk 337 338 IF ( kk == 1 ) THEN 339 CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait ) 340 ELSE 341 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 342 ENDIF 343 344 DO knew = 1, nz 345 ki = kk + nblk * ( knew - 1 ) 346 f_in(:,:,knew) = f_inv(:,:,ki) 347 ENDDO 348 349 CALL transpose_zx( f_in, ar1(:,:,:)) 350 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 351 352 IF ( kk == 1 ) THEN 353 CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait ) 354 ELSE 355 CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait ) 356 ENDIF 357 358 n = isave(2) + kk - 1 359 CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n)) 360 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 361 362 ENDDO 363 ! 364 !-- Restore original indices/counters 365 nz = isave(1) 366 nzb_x = isave(2) 367 nzt_x = isave(3) 368 sendrecvcount_zx = isave(4) 369 370 DEALLOCATE( ar1, f_in, f_inv ) 371 372 ! 373 !-- Transposition x --> y + subsequent fft along y 374 ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 375 CALL resort_for_xy( f_out_z, f_inv ) 376 ! 377 !-- Save original indices and gridpoint counter 378 isave(1) = nx 379 isave(2) = nxl_y 380 isave(3) = nxr_y 381 isave(4) = sendrecvcount_xy 382 ! 383 !-- Set new indices for transformation 384 nblk = ( ( nx+1 ) / pdims(2) ) - 1 385 nx = pdims(2) 386 nnx_y = 1 387 nxl_y = myidy * nnx_y 388 nxr_y = ( myidy + 1 ) * nnx_y - 1 389 sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 ) 390 391 ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ) 392 ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 393 394 DO ii = 0, nblk 395 396 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 397 398 DO inew = 0, nx-1 399 iind = ii + ( nblk + 1 ) * inew 400 f_in(:,:,inew) = f_inv(:,:,iind) 401 ENDDO 402 403 CALL transpose_xy( f_in, ar1(:,:,:) ) 404 405 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 406 407 IF ( ii == 1 ) THEN 408 CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait ) 409 ELSE 410 CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait ) 411 ENDIF 412 413 nxl_y_bound = isave(2) 414 nxr_y_bound = isave(3) 415 n = isave(2) + ii 416 CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, & 417 nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, & 418 nxl_y_l = n, nxr_y_l = n ) 419 420 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 421 422 ENDDO 423 ! 424 !-- Restore original indices/counters 425 nx = isave(1) 426 nxl_y = isave(2) 427 nxr_y = isave(3) 428 sendrecvcount_xy = isave(4) 429 430 DEALLOCATE( ar1, f_in, f_inv ) 431 432 ! 433 !-- Transposition y --> z + subsequent tridia + resort for z --> y 434 ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) ) 435 CALL resort_for_yz( f_out_y, f_inv ) 436 ! 437 !-- Save original indices and gridpoint counter 438 isave(1) = ny 439 isave(2) = nys_z 440 isave(3) = nyn_z 441 isave(4) = sendrecvcount_yz 442 ! 443 !-- Set new indices for transformation 444 nblk = ( ( ny+1 ) / pdims(1) ) - 1 445 ny = pdims(1) 446 nny_z = 1 447 nys_z = myidx * nny_z 448 nyn_z = ( myidx + 1 ) * nny_z - 1 449 sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 ) 450 451 ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) ) 452 ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) ) 453 454 DO jj = 0, nblk 455 ! 456 !-- Forward Fourier Transformation 457 !-- Transposition y --> z 458 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait ) 459 460 DO jnew = 0, ny-1 461 jind = jj + ( nblk + 1 ) * jnew 462 f_in(:,:,jnew) = f_inv(:,:,jind) 463 ENDDO 464 465 CALL transpose_yz( f_in, ar1(:,:,:) ) 466 467 IF ( jj == nblk ) THEN 468 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) 469 ELSE 470 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 471 ENDIF 472 473 ! 474 !-- Solve the tridiagonal equation system along z 475 CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait ) 476 477 n = isave(2) + jj 478 CALL tridia_substi_overlap( ar1(:,:,:), n ) 479 480 CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) 481 482 ! 483 !-- Inverse Fourier Transformation 484 !-- Transposition z --> y 485 !-- Only one thread should call MPI routines, therefore forward and 486 !-- backward tranpose are in the same section 487 IF ( jj == 0 ) THEN 488 CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait ) 489 ELSE 490 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 491 ENDIF 492 493 CALL transpose_zy( ar1(:,:,:), f_in ) 494 495 DO jnew = 0, ny-1 496 jind = jj + ( nblk + 1 ) * jnew 497 f_inv(:,:,jind) = f_in(:,:,jnew) 498 ENDDO 499 639 640 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 641 642 CALL transpose_xz( ar1(:,:,:), f_in ) 643 644 DO knew = 1, nz 645 ki = kk + nblk * ( knew - 1 ) 646 f_inv(:,:,ki) = f_in(:,:,knew) 647 ENDDO 648 649 IF ( kk == nblk ) THEN 650 CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) 651 ELSE 500 652 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 501 502 ENDDO 503 ! 504 !-- Restore original indices/counters 505 ny = isave(1) 506 nys_z = isave(2) 507 nyn_z = isave(3) 508 sendrecvcount_yz = isave(4) 509 510 CALL resort_for_zy( f_inv, f_out_y ) 511 512 DEALLOCATE( ar1, f_in, f_inv ) 513 514 ! 515 !-- fft along y backward + subsequent transposition y --> x 516 ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 517 ! 518 !-- Save original indices and gridpoint counter 519 isave(1) = nx 520 isave(2) = nxl_y 521 isave(3) = nxr_y 522 isave(4) = sendrecvcount_xy 523 ! 524 !-- Set new indices for transformation 525 nblk = (( nx+1 ) / pdims(2) ) - 1 526 nx = pdims(2) 527 nnx_y = 1 528 nxl_y = myidy * nnx_y 529 nxr_y = ( myidy + 1 ) * nnx_y - 1 530 sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 ) 531 532 ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ) 533 ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) ) 534 535 DO ii = 0, nblk 536 537 CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait ) 538 539 n = isave(2) + ii 540 nxl_y_bound = isave(2) 541 nxr_y_bound = isave(3) 542 543 CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, & 544 nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, & 545 nxl_y_l = n, nxr_y_l = n ) 546 547 IF ( ii == nblk ) THEN 548 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) 549 ELSE 550 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 551 ENDIF 552 553 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 554 555 CALL transpose_yx( ar1(:,:,:), f_in ) 556 557 DO inew = 0, nx-1 558 iind = ii + (nblk+1) * inew 559 f_inv(:,:,iind) = f_in(:,:,inew) 560 ENDDO 561 562 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 563 564 ENDDO 565 ! 566 !-- Restore original indices/counters 567 nx = isave(1) 568 nxl_y = isave(2) 569 nxr_y = isave(3) 570 sendrecvcount_xy = isave(4) 571 572 CALL resort_for_yx( f_inv, f_out_z ) 573 574 DEALLOCATE( ar1, f_in, f_inv ) 575 576 ! 577 !-- fft along x backward + subsequent final transposition x --> z 578 ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) ) 579 ! 580 !-- Save original indices and gridpoint counter 581 isave(1) = nz 582 isave(2) = nzb_x 583 isave(3) = nzt_x 584 isave(4) = sendrecvcount_zx 585 ! 586 !-- Set new indices for transformation 587 nblk = nz / pdims(1) 588 nz = pdims(1) 589 nnz_x = 1 590 nzb_x = 1 + myidx * nnz_x 591 nzt_x = ( myidx + 1 ) * nnz_x 592 sendrecvcount_zx = nnx * nny * nnz_x 593 594 ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) 595 ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) ) 596 597 DO kk = 1, nblk 598 599 CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait ) 600 601 n = isave(2) + kk - 1 602 CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n)) 603 604 IF ( kk == nblk ) THEN 605 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) 606 ELSE 607 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 608 ENDIF 609 610 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait ) 611 612 CALL transpose_xz( ar1(:,:,:), f_in ) 613 614 DO knew = 1, nz 615 ki = kk + nblk * (knew-1) 616 f_inv(:,:,ki) = f_in(:,:,knew) 617 ENDDO 618 619 IF ( kk == nblk ) THEN 620 CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) 621 ELSE 622 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 623 ENDIF 624 625 ENDDO 626 ! 627 !-- Restore original indices/counters 628 nz = isave(1) 629 nzb_x = isave(2) 630 nzt_x = isave(3) 631 sendrecvcount_zx = isave(4) 632 633 CALL resort_for_xz( f_inv, ar ) 634 635 DEALLOCATE( ar1, f_in, f_inv ) 636 637 ENDIF 653 ENDIF 654 655 ENDDO 656 ! 657 !-- Restore original indices/counters 658 nz = isave(1) 659 nzb_x = isave(2) 660 nzt_x = isave(3) 661 sendrecvcount_zx = isave(4) 662 663 CALL resort_for_xz( f_inv, ar ) 664 665 DEALLOCATE( ar1, f_in, f_inv ) 666 667 ENDIF 638 668 #endif 639 669 640 670 #if !__acc_fft_device 641 671 !$ACC UPDATE DEVICE(ar) 642 672 #endif 643 673 644 645 646 647 648 649 !------------------------------------------------------------------------------ !674 CALL cpu_log( log_point_s(3), 'poisfft', 'stop' ) 675 676 END SUBROUTINE poisfft 677 678 679 !--------------------------------------------------------------------------------------------------! 650 680 ! Description: 651 681 ! ------------ 652 !> Fourier-transformation along y with subsequent transposition y --> x for 653 !> a 1d-decomposition along x.682 !> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition 683 !> along x. 654 684 !> 655 !> @attention The performance of this routine is much faster on the NEC-SX6, 656 !> if the first index of work_ffty_vec is odd. Otherwise 657 !> memory bank conflicts may occur (especially if the index is a 658 !> multiple of 128). That's why work_ffty_vec is dimensioned as 659 !> 0:ny+1. 660 !> Of course, this will not work if users are using an odd number 661 !> of gridpoints along y. 662 !------------------------------------------------------------------------------! 663 SUBROUTINE ffty_tr_yx( f_in, f_out ) 664 665 USE control_parameters, & 666 ONLY: loop_optimization 667 668 USE cpulog, & 669 ONLY: cpu_log, log_point_s 670 671 USE kinds 672 673 USE pegrid 674 675 IMPLICIT NONE 676 677 INTEGER(iwp) :: i !< 678 INTEGER(iwp) :: iend !< 679 INTEGER(iwp) :: iouter !< 680 INTEGER(iwp) :: ir !< 681 INTEGER(iwp) :: j !< 682 INTEGER(iwp) :: k !< 683 684 INTEGER(iwp), PARAMETER :: stridex = 4 !< 685 686 REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !< 687 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !< 688 REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< 689 690 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< 691 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< 692 693 ! 694 !-- Carry out the FFT along y, where all data are present due to the 695 !-- 1d-decomposition along x. Resort the data in a way that x becomes 696 !-- the first index. 697 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' ) 698 699 IF ( loop_optimization == 'vector' ) THEN 700 701 ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) 702 ! 703 !-- Code optimized for vector processors 704 !$OMP PARALLEL PRIVATE ( i, j, k ) 705 !$OMP DO 706 DO i = nxl, nxr 707 685 !> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of 686 !> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the 687 !> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1. 688 !> Of course, this will not work if users are using an odd number of gridpoints along y. 689 !--------------------------------------------------------------------------------------------------! 690 SUBROUTINE ffty_tr_yx( f_in, f_out ) 691 692 USE control_parameters, & 693 ONLY: loop_optimization 694 695 USE cpulog, & 696 ONLY: cpu_log, & 697 log_point_s 698 699 USE kinds 700 701 USE pegrid 702 703 IMPLICIT NONE 704 705 INTEGER(iwp) :: i !< 706 INTEGER(iwp) :: iend !< 707 INTEGER(iwp) :: iouter !< 708 INTEGER(iwp) :: ir !< 709 INTEGER(iwp) :: j !< 710 INTEGER(iwp) :: k !< 711 712 INTEGER(iwp), PARAMETER :: stridex = 4 !< 713 714 REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !< 715 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !< 716 REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< 717 718 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< 719 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< 720 721 ! 722 !-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x. 723 !-- Resort the data in a way that x becomes the first index. 724 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' ) 725 726 IF ( loop_optimization == 'vector' ) THEN 727 728 ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) 729 ! 730 !-- Code optimized for vector processors 731 !$OMP PARALLEL PRIVATE ( i, j, k ) 732 !$OMP DO 733 DO i = nxl, nxr 734 735 DO j = 0, ny 736 DO k = 1, nz 737 work_ffty_vec(j,k,i) = f_in(k,j,i) 738 ENDDO 739 ENDDO 740 741 CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' ) 742 743 ENDDO 744 745 !$OMP DO 746 DO k = 1, nz 747 DO j = 0, ny 748 DO i = nxl, nxr 749 work(i,k,j) = work_ffty_vec(j,k,i) 750 ENDDO 751 ENDDO 752 ENDDO 753 !$OMP END PARALLEL 754 755 DEALLOCATE( work_ffty_vec ) 756 757 ELSE 758 ! 759 !-- Cache optimized code. 760 ALLOCATE( work_ffty(0:ny,stridex) ) 761 ! 762 !-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache 763 !-- performance 764 !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty) 765 !$OMP DO 766 DO iouter = nxl, nxr, stridex 767 768 iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop 769 770 DO k = 1, nz 771 772 DO i = iouter, iend 773 774 ir = i-iouter+1 ! Counter within a stride 775 DO j = 0, ny 776 work_ffty(j,ir) = f_in(k,j,i) 777 ENDDO 778 ! 779 !-- FFT along y 780 CALL fft_y_1d( work_ffty(:,ir), 'forward' ) 781 782 ENDDO 783 784 ! 785 !-- Resort 708 786 DO j = 0, ny 709 DO k = 1, nz710 work _ffty_vec(j,k,i) = f_in(k,j,i)787 DO i = iouter, iend 788 work(i,k,j) = work_ffty(j,i-iouter+1) 711 789 ENDDO 712 790 ENDDO 713 791 714 CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' ) 715 716 ENDDO 717 718 !$OMP DO 719 DO k = 1, nz 720 DO j = 0, ny 721 DO i = nxl, nxr 722 work(i,k,j) = work_ffty_vec(j,k,i) 723 ENDDO 724 ENDDO 725 ENDDO 726 !$OMP END PARALLEL 727 728 DEALLOCATE( work_ffty_vec ) 729 730 ELSE 731 ! 732 !-- Cache optimized code. 733 ALLOCATE( work_ffty(0:ny,stridex) ) 734 ! 735 !-- The i-(x-)direction is split into a strided outer loop and an inner 736 !-- loop for better cache performance 737 !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty) 738 !$OMP DO 739 DO iouter = nxl, nxr, stridex 740 741 iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop 742 743 DO k = 1, nz 744 745 DO i = iouter, iend 746 747 ir = i-iouter+1 ! counter within a stride 748 DO j = 0, ny 749 work_ffty(j,ir) = f_in(k,j,i) 750 ENDDO 751 ! 752 !-- FFT along y 753 CALL fft_y_1d( work_ffty(:,ir), 'forward' ) 754 755 ENDDO 756 757 ! 758 !-- Resort 759 DO j = 0, ny 760 DO i = iouter, iend 761 work(i,k,j) = work_ffty(j,i-iouter+1) 762 ENDDO 763 ENDDO 764 765 ENDDO 766 767 ENDDO 768 !$OMP END PARALLEL 769 770 DEALLOCATE( work_ffty ) 771 772 ENDIF 773 774 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' ) 775 776 ! 777 !-- Transpose array 792 ENDDO 793 794 ENDDO 795 !$OMP END PARALLEL 796 797 DEALLOCATE( work_ffty ) 798 799 ENDIF 800 801 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' ) 802 803 ! 804 !-- Transpose array 778 805 #if defined( __parallel ) 779 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 780 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 781 CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, & 782 f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, & 783 comm1dx, ierr ) 784 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 806 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 807 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 808 CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), & 809 sendrecvcount_xy, MPI_REAL, comm1dx, ierr ) 810 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 785 811 #else 786 812 ! 787 !-- 788 813 !-- Next line required to avoid compile error about unused dummy argument in serial mode 814 i = SIZE( f_out ) 789 815 #endif 790 816 791 792 793 794 !------------------------------------------------------------------------------ !817 END SUBROUTINE ffty_tr_yx 818 819 820 !--------------------------------------------------------------------------------------------------! 795 821 ! Description: 796 822 ! ------------ 797 !> Transposition x --> y with a subsequent backward Fourier transformation for 798 !> a 1d-decomposition along x 799 !------------------------------------------------------------------------------! 800 SUBROUTINE tr_xy_ffty( f_in, f_out ) 801 802 USE control_parameters, & 803 ONLY: loop_optimization 804 805 USE cpulog, & 806 ONLY: cpu_log, log_point_s 807 808 USE kinds 809 810 USE pegrid 811 812 IMPLICIT NONE 813 814 INTEGER(iwp) :: i !< 815 INTEGER(iwp) :: iend !< 816 INTEGER(iwp) :: iouter !< 817 INTEGER(iwp) :: ir !< 818 INTEGER(iwp) :: j !< 819 INTEGER(iwp) :: k !< 820 821 INTEGER(iwp), PARAMETER :: stridex = 4 !< 822 823 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !< 824 REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !< 825 REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< 826 827 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< 828 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< 829 830 ! 831 !-- Transpose array 823 !> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition 824 !> along x 825 !--------------------------------------------------------------------------------------------------! 826 SUBROUTINE tr_xy_ffty( f_in, f_out ) 827 828 USE control_parameters, & 829 ONLY: loop_optimization 830 831 USE cpulog, & 832 ONLY: cpu_log, & 833 log_point_s 834 835 USE kinds 836 837 USE pegrid 838 839 IMPLICIT NONE 840 841 INTEGER(iwp) :: i !< 842 INTEGER(iwp) :: iend !< 843 INTEGER(iwp) :: iouter !< 844 INTEGER(iwp) :: ir !< 845 INTEGER(iwp) :: j !< 846 INTEGER(iwp) :: k !< 847 848 INTEGER(iwp), PARAMETER :: stridex = 4 !< 849 850 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !< 851 REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !< 852 REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< 853 854 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< 855 REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< 856 857 ! 858 !-- Transpose array 832 859 #if defined( __parallel ) 833 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 834 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 835 CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, & 836 work(nxl,1,0), sendrecvcount_xy, MPI_REAL, & 837 comm1dx, ierr ) 838 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 860 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 861 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 862 CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), & 863 sendrecvcount_xy, MPI_REAL, comm1dx, ierr ) 864 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 839 865 #else 840 866 ! 841 !-- 842 867 !-- Next line required to avoid compile error about unused dummy argument in serial mode 868 i = SIZE( f_in ) 843 869 #endif 844 870 845 871 ! 846 !-- Resort the data in a way that y becomes the first index and carry out the 847 !-- backward fft along y. 848 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' ) 849 850 IF ( loop_optimization == 'vector' ) THEN 851 852 ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) 853 ! 854 !-- Code optimized for vector processors 855 !$OMP PARALLEL PRIVATE ( i, j, k ) 856 !$OMP DO 872 !-- Resort the data in a way that y becomes the first index and carry out the backward fft along y. 873 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' ) 874 875 IF ( loop_optimization == 'vector' ) THEN 876 877 ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) 878 ! 879 !-- Code optimized for vector processors 880 !$OMP PARALLEL PRIVATE ( i, j, k ) 881 !$OMP DO 882 DO k = 1, nz 883 DO j = 0, ny 884 DO i = nxl, nxr 885 work_ffty_vec(j,k,i) = work(i,k,j) 886 ENDDO 887 ENDDO 888 ENDDO 889 890 !$OMP DO 891 DO i = nxl, nxr 892 893 CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' ) 894 895 DO j = 0, ny 896 DO k = 1, nz 897 f_out(k,j,i) = work_ffty_vec(j,k,i) 898 ENDDO 899 ENDDO 900 901 ENDDO 902 !$OMP END PARALLEL 903 904 DEALLOCATE( work_ffty_vec ) 905 906 ELSE 907 ! 908 !-- Cache optimized code. 909 ALLOCATE( work_ffty(0:ny,stridex) ) 910 ! 911 !-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache 912 !-- performance 913 !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty ) 914 !$OMP DO 915 DO iouter = nxl, nxr, stridex 916 917 iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop 918 857 919 DO k = 1, nz 920 ! 921 !-- Resort 858 922 DO j = 0, ny 859 DO i = nxl, nxr860 work_ffty _vec(j,k,i) = work(i,k,j)923 DO i = iouter, iend 924 work_ffty(j,i-iouter+1) = work(i,k,j) 861 925 ENDDO 862 926 ENDDO 863 ENDDO 864 865 !$OMP DO 866 DO i = nxl, nxr 867 868 CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )869 870 DO j = 0, ny 871 DO k = 1, nz872 f_out(k,j,i) = work_ffty _vec(j,k,i)927 928 DO i = iouter, iend 929 930 ! 931 !-- FFT along y 932 ir = i-iouter+1 ! Counter within a stride 933 CALL fft_y_1d( work_ffty(:,ir), 'backward' ) 934 935 DO j = 0, ny 936 f_out(k,j,i) = work_ffty(j,ir) 873 937 ENDDO 874 938 ENDDO 875 939 876 940 ENDDO 877 !$OMP END PARALLEL 878 879 DEALLOCATE( work_ffty_vec ) 880 881 ELSE 882 ! 883 !-- Cache optimized code. 884 ALLOCATE( work_ffty(0:ny,stridex) ) 885 ! 886 !-- The i-(x-)direction is split into a strided outer loop and an inner 887 !-- loop for better cache performance 888 !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty ) 889 !$OMP DO 890 DO iouter = nxl, nxr, stridex 891 892 iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop 893 894 DO k = 1, nz 895 ! 896 !-- Resort 897 DO j = 0, ny 898 DO i = iouter, iend 899 work_ffty(j,i-iouter+1) = work(i,k,j) 900 ENDDO 901 ENDDO 902 903 DO i = iouter, iend 904 905 ! 906 !-- FFT along y 907 ir = i-iouter+1 ! counter within a stride 908 CALL fft_y_1d( work_ffty(:,ir), 'backward' ) 909 910 DO j = 0, ny 911 f_out(k,j,i) = work_ffty(j,ir) 912 ENDDO 913 ENDDO 914 915 ENDDO 916 917 ENDDO 918 !$OMP END PARALLEL 919 920 DEALLOCATE( work_ffty ) 921 922 ENDIF 923 924 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' ) 925 926 END SUBROUTINE tr_xy_ffty 927 928 929 !------------------------------------------------------------------------------! 941 942 ENDDO 943 !$OMP END PARALLEL 944 945 DEALLOCATE( work_ffty ) 946 947 ENDIF 948 949 CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' ) 950 951 END SUBROUTINE tr_xy_ffty 952 953 954 !--------------------------------------------------------------------------------------------------! 930 955 ! Description: 931 956 ! ------------ 932 !> FFT along x, solution of the tridiagonal system and backward FFT for 933 !> a 1d-decomposition along x 957 !> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x 934 958 !> 935 !> @warning this subroutine may still not work for hybrid parallelization 936 !> with OpenMP (for possible necessary changes see the original 937 !> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002) 938 !------------------------------------------------------------------------------! 939 SUBROUTINE fftx_tri_fftx( ar ) 940 941 USE control_parameters, & 942 ONLY: loop_optimization 943 944 USE cpulog, & 945 ONLY: cpu_log, log_point_s 946 947 USE grid_variables, & 948 ONLY: ddx2, ddy2 949 950 USE kinds 951 952 USE pegrid 953 954 IMPLICIT NONE 955 956 INTEGER(iwp) :: i !< 957 INTEGER(iwp) :: j !< 958 INTEGER(iwp) :: k !< 959 INTEGER(iwp) :: m !< 960 INTEGER(iwp) :: n !< 961 !$ INTEGER(iwp) :: omp_get_thread_num !< 962 INTEGER(iwp) :: tn !< 963 964 REAL(wp), DIMENSION(0:nx) :: work_fftx !< 965 REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !< 966 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !< 967 REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< 968 969 970 CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' ) 971 972 ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) ) 973 974 tn = 0 ! Default thread number in case of one thread 959 !> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible 960 !> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen, 961 !> May 2002) 962 !--------------------------------------------------------------------------------------------------! 963 SUBROUTINE fftx_tri_fftx( ar ) 964 965 USE control_parameters, & 966 ONLY: loop_optimization 967 968 USE cpulog, & 969 ONLY: cpu_log, & 970 log_point_s 971 972 USE grid_variables, & 973 ONLY: ddx2, & 974 ddy2 975 976 USE kinds 977 978 USE pegrid 979 980 IMPLICIT NONE 981 982 INTEGER(iwp) :: i !< 983 INTEGER(iwp) :: j !< 984 INTEGER(iwp) :: k !< 985 INTEGER(iwp) :: m !< 986 INTEGER(iwp) :: n !< 987 ! INTEGER(iwp) :: omp_get_thread_num !< 988 INTEGER(iwp) :: tn !< 989 990 REAL(wp), DIMENSION(0:nx) :: work_fftx !< 991 REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !< 992 REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !< 993 REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< 994 995 996 CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' ) 997 998 ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) ) 999 1000 tn = 0 ! Default thread number in case of one thread 975 1001 !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix ) 976 DO j = nys_x, nyn_x 977 978 !$ tn = omp_get_thread_num() 979 980 IF ( loop_optimization == 'vector' ) THEN 981 ! 982 !-- Code optimized for vector processors 983 DO k = 1, nz 984 985 m = 0 986 DO n = 1, pdims(1) 987 DO i = 1, nnx 988 work_trix(m,k) = ar(i,k,j,n) 989 m = m + 1 990 ENDDO 1002 DO j = nys_x, nyn_x 1003 1004 !$ tn = omp_get_thread_num() 1005 1006 IF ( loop_optimization == 'vector' ) THEN 1007 ! 1008 !-- Code optimized for vector processors 1009 DO k = 1, nz 1010 1011 m = 0 1012 DO n = 1, pdims(1) 1013 DO i = 1, nnx 1014 work_trix(m,k) = ar(i,k,j,n) 1015 m = m + 1 991 1016 ENDDO 992 993 ENDDO 994 995 CALL fft_x_m( work_trix, 'forward' ) 996 997 ELSE 998 ! 999 ! -- Cache optimized code1000 DO k = 1, nz 1001 1002 m = 0 1003 DO n = 1, pdims(1)1004 DO i = 1, nnx1005 work_fftx(m) = ar(i,k,j,n)1006 m = m + 11007 ENDDO1017 ENDDO 1018 1019 ENDDO 1020 1021 CALL fft_x_m( work_trix, 'forward' ) 1022 1023 ELSE 1024 ! 1025 !-- Cache optimized code 1026 DO k = 1, nz 1027 1028 m = 0 1029 DO n = 1, pdims(1) 1030 DO i = 1, nnx 1031 work_fftx(m) = ar(i,k,j,n) 1032 m = m + 1 1008 1033 ENDDO 1009 1010 CALL fft_x_1d( work_fftx, 'forward' ) 1011 1012 DO i = 0, nx 1013 work_trix(i,k) = work_fftx(i) 1034 ENDDO 1035 1036 CALL fft_x_1d( work_fftx, 'forward' ) 1037 1038 DO i = 0, nx 1039 work_trix(i,k) = work_fftx(i) 1040 ENDDO 1041 1042 ENDDO 1043 1044 ENDIF 1045 1046 ! 1047 !-- Solve the linear equation system 1048 CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) ) 1049 1050 IF ( loop_optimization == 'vector' ) THEN 1051 ! 1052 !-- Code optimized for vector processors 1053 CALL fft_x_m( work_trix, 'backward' ) 1054 1055 DO k = 1, nz 1056 1057 m = 0 1058 DO n = 1, pdims(1) 1059 DO i = 1, nnx 1060 ar(i,k,j,n) = work_trix(m,k) 1061 m = m + 1 1014 1062 ENDDO 1015 1016 ENDDO 1017 1018 ENDIF 1019 1020 ! 1021 !-- Solve the linear equation system 1022 CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) ) 1023 1024 IF ( loop_optimization == 'vector' ) THEN 1025 ! 1026 !-- Code optimized for vector processors 1027 CALL fft_x_m( work_trix, 'backward' ) 1028 1029 DO k = 1, nz 1030 1031 m = 0 1032 DO n = 1, pdims(1) 1033 DO i = 1, nnx 1034 ar(i,k,j,n) = work_trix(m,k) 1035 m = m + 1 1036 ENDDO 1063 ENDDO 1064 1065 ENDDO 1066 1067 ELSE 1068 ! 1069 !-- Cache optimized code 1070 DO k = 1, nz 1071 1072 DO i = 0, nx 1073 work_fftx(i) = work_trix(i,k) 1074 ENDDO 1075 1076 CALL fft_x_1d( work_fftx, 'backward' ) 1077 1078 m = 0 1079 DO n = 1, pdims(1) 1080 DO i = 1, nnx 1081 ar(i,k,j,n) = work_fftx(m) 1082 m = m + 1 1037 1083 ENDDO 1038 1039 ENDDO 1040 1041 ELSE 1042 ! 1043 !-- Cache optimized code 1044 DO k = 1, nz 1045 1046 DO i = 0, nx 1047 work_fftx(i) = work_trix(i,k) 1048 ENDDO 1049 1050 CALL fft_x_1d( work_fftx, 'backward' ) 1051 1052 m = 0 1053 DO n = 1, pdims(1) 1054 DO i = 1, nnx 1055 ar(i,k,j,n) = work_fftx(m) 1056 m = m + 1 1057 ENDDO 1058 ENDDO 1059 1060 ENDDO 1061 1062 ENDIF 1063 1064 ENDDO 1065 1066 DEALLOCATE( tri ) 1067 1068 CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' ) 1069 1070 END SUBROUTINE fftx_tri_fftx 1071 1072 1073 !------------------------------------------------------------------------------! 1084 ENDDO 1085 1086 ENDDO 1087 1088 ENDIF 1089 1090 ENDDO 1091 1092 DEALLOCATE( tri ) 1093 1094 CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' ) 1095 1096 END SUBROUTINE fftx_tri_fftx 1097 1098 1099 !--------------------------------------------------------------------------------------------------! 1074 1100 ! Description: 1075 1101 ! ------------ 1076 !> Fourier-transformation along x with subsequent transposition x --> y for 1077 !> a 1d-decomposition along y.1102 !> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition 1103 !> along y. 1078 1104 !> 1079 !> @attention NEC-branch of this routine may significantly profit from 1080 !> further optimizations. So far, performance is much worse than 1081 !> for routine ffty_tr_yx (more than three times slower). 1082 !------------------------------------------------------------------------------! 1083 SUBROUTINE fftx_tr_xy( f_in, f_out ) 1084 1085 1086 USE control_parameters, & 1087 ONLY: loop_optimization 1088 1089 USE cpulog, & 1090 ONLY: cpu_log, log_point_s 1091 1092 USE kinds 1093 1094 USE pegrid 1095 1096 IMPLICIT NONE 1097 1098 INTEGER(iwp) :: i !< 1099 INTEGER(iwp) :: j !< 1100 INTEGER(iwp) :: k !< 1101 1102 REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< 1103 REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !< 1104 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !< 1105 REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< 1106 1107 ! 1108 !-- Carry out the FFT along x, where all data are present due to the 1109 !-- 1d-decomposition along y. Resort the data in a way that y becomes 1110 !-- the first index. 1111 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' ) 1112 1113 IF ( loop_optimization == 'vector' ) THEN 1114 ! 1115 !-- Code for vector processors 1116 !$OMP PARALLEL PRIVATE ( i, j, k ) 1117 !$OMP DO 1118 DO i = 0, nx 1119 1120 DO j = nys, nyn 1121 DO k = 1, nz 1122 work_fftx(i,k,j) = f_in(k,j,i) 1123 ENDDO 1124 ENDDO 1125 1126 ENDDO 1127 1128 !$OMP DO 1129 DO j = nys, nyn 1130 1131 CALL fft_x_m( work_fftx(:,:,j), 'forward' ) 1132 1133 DO k = 1, nz 1134 DO i = 0, nx 1135 work(j,k,i) = work_fftx(i,k,j) 1136 ENDDO 1137 ENDDO 1138 1139 ENDDO 1140 !$OMP END PARALLEL 1141 1142 ELSE 1143 1144 ! 1145 !-- Cache optimized code (there might be still a potential for better 1146 !-- optimization). 1147 !$OMP PARALLEL PRIVATE (i,j,k) 1148 !$OMP DO 1149 DO i = 0, nx 1150 1151 DO j = nys, nyn 1152 DO k = 1, nz 1153 work_fftx(i,k,j) = f_in(k,j,i) 1154 ENDDO 1155 ENDDO 1156 1157 ENDDO 1158 1159 !$OMP DO 1105 !> @attention NEC-branch of this routine may significantly profit from further optimizations. So 1106 !> far, performance is much worse than for routine ffty_tr_yx (more than three times 1107 !> slower). 1108 !--------------------------------------------------------------------------------------------------! 1109 SUBROUTINE fftx_tr_xy( f_in, f_out ) 1110 1111 1112 USE control_parameters, & 1113 ONLY: loop_optimization 1114 1115 USE cpulog, & 1116 ONLY: cpu_log, & 1117 log_point_s 1118 1119 USE kinds 1120 1121 USE pegrid 1122 1123 IMPLICIT NONE 1124 1125 INTEGER(iwp) :: i !< 1126 INTEGER(iwp) :: j !< 1127 INTEGER(iwp) :: k !< 1128 1129 REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< 1130 REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !< 1131 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !< 1132 REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< 1133 1134 ! 1135 !-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y. 1136 !-- Resort the data in a way that y becomes the first index. 1137 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' ) 1138 1139 IF ( loop_optimization == 'vector' ) THEN 1140 ! 1141 !-- Code for vector processors 1142 !$OMP PARALLEL PRIVATE ( i, j, k ) 1143 !$OMP DO 1144 DO i = 0, nx 1145 1160 1146 DO j = nys, nyn 1161 1147 DO k = 1, nz 1162 1163 CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' ) 1164 1165 DO i = 0, nx 1166 work(j,k,i) = work_fftx(i,k,j) 1167 ENDDO 1168 ENDDO 1169 1170 ENDDO 1171 !$OMP END PARALLEL 1172 1173 ENDIF 1174 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' ) 1175 1176 ! 1177 !-- Transpose array 1148 work_fftx(i,k,j) = f_in(k,j,i) 1149 ENDDO 1150 ENDDO 1151 1152 ENDDO 1153 1154 !$OMP DO 1155 DO j = nys, nyn 1156 1157 CALL fft_x_m( work_fftx(:,:,j), 'forward' ) 1158 1159 DO k = 1, nz 1160 DO i = 0, nx 1161 work(j,k,i) = work_fftx(i,k,j) 1162 ENDDO 1163 ENDDO 1164 1165 ENDDO 1166 !$OMP END PARALLEL 1167 1168 ELSE 1169 1170 ! 1171 !-- Cache optimized code (there might still be a potential for better optimization). 1172 !$OMP PARALLEL PRIVATE (i,j,k) 1173 !$OMP DO 1174 DO i = 0, nx 1175 1176 DO j = nys, nyn 1177 DO k = 1, nz 1178 work_fftx(i,k,j) = f_in(k,j,i) 1179 ENDDO 1180 ENDDO 1181 1182 ENDDO 1183 1184 !$OMP DO 1185 DO j = nys, nyn 1186 DO k = 1, nz 1187 1188 CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' ) 1189 1190 DO i = 0, nx 1191 work(j,k,i) = work_fftx(i,k,j) 1192 ENDDO 1193 ENDDO 1194 1195 ENDDO 1196 !$OMP END PARALLEL 1197 1198 ENDIF 1199 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' ) 1200 1201 ! 1202 !-- Transpose array 1178 1203 #if defined( __parallel ) 1179 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1180 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 1181 CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, & 1182 f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, & 1183 comm1dy, ierr ) 1184 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 1204 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1205 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 1206 CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), & 1207 sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) 1208 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 1185 1209 #else 1186 1210 ! 1187 !-- 1188 1211 !-- Next line required to avoid compile error about unused dummy argument in serial mode 1212 i = SIZE( f_out ) 1189 1213 #endif 1190 1214 1191 1192 1193 1194 !------------------------------------------------------------------------------ !1215 END SUBROUTINE fftx_tr_xy 1216 1217 1218 !--------------------------------------------------------------------------------------------------! 1195 1219 ! Description: 1196 1220 ! ------------ 1197 !> Transposition y --> x with a subsequent backward Fourier transformation for 1198 !> a 1d-decomposition along x. 1199 !------------------------------------------------------------------------------! 1200 SUBROUTINE tr_yx_fftx( f_in, f_out ) 1201 1202 1203 USE control_parameters, & 1204 ONLY: loop_optimization 1205 1206 USE cpulog, & 1207 ONLY: cpu_log, log_point_s 1208 1209 USE kinds 1210 1211 USE pegrid 1212 1213 IMPLICIT NONE 1214 1215 INTEGER(iwp) :: i !< 1216 INTEGER(iwp) :: j !< 1217 INTEGER(iwp) :: k !< 1218 1219 REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< 1220 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !< 1221 REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !< 1222 REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< 1223 1224 1225 ! 1226 !-- Transpose array 1221 !> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition 1222 !> along x. 1223 !--------------------------------------------------------------------------------------------------! 1224 SUBROUTINE tr_yx_fftx( f_in, f_out ) 1225 1226 1227 USE control_parameters, & 1228 ONLY: loop_optimization 1229 1230 USE cpulog, & 1231 ONLY: cpu_log, & 1232 log_point_s 1233 1234 USE kinds 1235 1236 USE pegrid 1237 1238 IMPLICIT NONE 1239 1240 INTEGER(iwp) :: i !< 1241 INTEGER(iwp) :: j !< 1242 INTEGER(iwp) :: k !< 1243 1244 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !< 1245 REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !< 1246 REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< 1247 REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< 1248 1249 1250 ! 1251 !-- Transpose array 1227 1252 #if defined( __parallel ) 1228 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1229 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 1230 CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, & 1231 work(nys,1,0), sendrecvcount_xy, MPI_REAL, & 1232 comm1dy, ierr ) 1233 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 1253 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1254 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 1255 CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), & 1256 sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) 1257 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 1234 1258 #else 1235 1259 ! 1236 !-- 1237 1260 !-- Next line required to avoid compile error about unused dummy argument in serial mode 1261 i = SIZE( f_in ) 1238 1262 #endif 1239 1263 1240 1264 ! 1241 !-- Carry out the FFT along x, where all data are present due to the 1242 !-- 1d-decomposition along y. Resort the data in a way that y becomes 1243 !-- the first index. 1244 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' ) 1245 1246 IF ( loop_optimization == 'vector' ) THEN 1247 ! 1248 !-- Code optimized for vector processors 1249 !$OMP PARALLEL PRIVATE ( i, j, k ) 1250 !$OMP DO 1251 DO j = nys, nyn 1252 1253 DO k = 1, nz 1254 DO i = 0, nx 1255 work_fftx(i,k,j) = work(j,k,i) 1256 ENDDO 1257 ENDDO 1258 1259 CALL fft_x_m( work_fftx(:,:,j), 'backward' ) 1260 1261 ENDDO 1262 1263 !$OMP DO 1264 DO i = 0, nx 1265 DO j = nys, nyn 1266 DO k = 1, nz 1267 f_out(k,j,i) = work_fftx(i,k,j) 1268 ENDDO 1269 ENDDO 1270 ENDDO 1271 !$OMP END PARALLEL 1272 1273 ELSE 1274 1275 ! 1276 !-- Cache optimized code (there might be still a potential for better 1277 !-- optimization). 1278 !$OMP PARALLEL PRIVATE (i,j,k) 1279 !$OMP DO 1265 !-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y. 1266 !-- Resort the data in a way that y becomes the first index. 1267 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' ) 1268 1269 IF ( loop_optimization == 'vector' ) THEN 1270 ! 1271 !-- Code optimized for vector processors 1272 !$OMP PARALLEL PRIVATE ( i, j, k ) 1273 !$OMP DO 1274 DO j = nys, nyn 1275 1276 DO k = 1, nz 1277 DO i = 0, nx 1278 work_fftx(i,k,j) = work(j,k,i) 1279 ENDDO 1280 ENDDO 1281 1282 CALL fft_x_m( work_fftx(:,:,j), 'backward' ) 1283 1284 ENDDO 1285 1286 !$OMP DO 1287 DO i = 0, nx 1280 1288 DO j = nys, nyn 1281 1289 DO k = 1, nz 1282 1283 DO i = 0, nx 1284 work_fftx(i,k,j) = work(j,k,i) 1285 ENDDO 1286 1287 CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' ) 1288 1289 ENDDO 1290 ENDDO 1291 1292 !$OMP DO 1293 DO i = 0, nx 1294 DO j = nys, nyn 1295 DO k = 1, nz 1296 f_out(k,j,i) = work_fftx(i,k,j) 1297 ENDDO 1298 ENDDO 1299 ENDDO 1300 !$OMP END PARALLEL 1301 1302 ENDIF 1303 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' ) 1304 1305 END SUBROUTINE tr_yx_fftx 1306 1307 1308 !------------------------------------------------------------------------------! 1290 f_out(k,j,i) = work_fftx(i,k,j) 1291 ENDDO 1292 ENDDO 1293 ENDDO 1294 !$OMP END PARALLEL 1295 1296 ELSE 1297 1298 ! 1299 !-- Cache optimized code (there might be still a potential for better optimization). 1300 !$OMP PARALLEL PRIVATE (i,j,k) 1301 !$OMP DO 1302 DO j = nys, nyn 1303 DO k = 1, nz 1304 1305 DO i = 0, nx 1306 work_fftx(i,k,j) = work(j,k,i) 1307 ENDDO 1308 1309 CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' ) 1310 1311 ENDDO 1312 ENDDO 1313 1314 !$OMP DO 1315 DO i = 0, nx 1316 DO j = nys, nyn 1317 DO k = 1, nz 1318 f_out(k,j,i) = work_fftx(i,k,j) 1319 ENDDO 1320 ENDDO 1321 ENDDO 1322 !$OMP END PARALLEL 1323 1324 ENDIF 1325 CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' ) 1326 1327 END SUBROUTINE tr_yx_fftx 1328 1329 1330 !--------------------------------------------------------------------------------------------------! 1309 1331 ! Description: 1310 1332 ! ------------ 1311 !> FFT along y, solution of the tridiagonal system and backward FFT for 1312 !> a 1d-decomposition along y. 1333 !> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y. 1313 1334 !> 1314 !> @warning this subroutine may still not work for hybrid parallelization 1315 !> with OpenMP (for possible necessary changes see the original 1316 !> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002) 1317 !------------------------------------------------------------------------------! 1318 SUBROUTINE ffty_tri_ffty( ar ) 1319 1320 1321 USE control_parameters, & 1322 ONLY: loop_optimization 1323 1324 USE cpulog, & 1325 ONLY: cpu_log, log_point_s 1326 1327 USE grid_variables, & 1328 ONLY: ddx2, ddy2 1329 1330 USE kinds 1331 1332 USE pegrid 1333 1334 IMPLICIT NONE 1335 1336 INTEGER(iwp) :: i !< 1337 INTEGER(iwp) :: j !< 1338 INTEGER(iwp) :: k !< 1339 INTEGER(iwp) :: m !< 1340 INTEGER(iwp) :: n !< 1341 !$ INTEGER(iwp) :: omp_get_thread_num !< 1342 INTEGER(iwp) :: tn !< 1343 1344 REAL(wp), DIMENSION(0:ny) :: work_ffty !< 1345 REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !< 1346 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !< 1347 REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< 1348 1349 1350 CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' ) 1351 1352 ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) ) 1353 1354 tn = 0 ! Default thread number in case of one thread 1355 !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy ) 1356 DO i = nxl_y, nxr_y 1357 1358 !$ tn = omp_get_thread_num() 1359 1360 IF ( loop_optimization == 'vector' ) THEN 1361 ! 1362 !-- Code optimized for vector processors 1363 DO k = 1, nz 1364 1365 m = 0 1366 DO n = 1, pdims(2) 1367 DO j = 1, nny 1368 work_triy(m,k) = ar(j,k,i,n) 1369 m = m + 1 1370 ENDDO 1335 !> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible 1336 !> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen, 1337 !> May 2002) 1338 !--------------------------------------------------------------------------------------------------! 1339 SUBROUTINE ffty_tri_ffty( ar ) 1340 1341 1342 USE control_parameters, & 1343 ONLY: loop_optimization 1344 1345 USE cpulog, & 1346 ONLY: cpu_log, & 1347 log_point_s 1348 1349 USE grid_variables, & 1350 ONLY: ddx2, & 1351 ddy2 1352 1353 USE kinds 1354 1355 USE pegrid 1356 1357 IMPLICIT NONE 1358 1359 INTEGER(iwp) :: i !< 1360 INTEGER(iwp) :: j !< 1361 INTEGER(iwp) :: k !< 1362 INTEGER(iwp) :: m !< 1363 INTEGER(iwp) :: n !< 1364 !$ INTEGER(iwp) :: omp_get_thread_num !< 1365 INTEGER(iwp) :: tn !< 1366 1367 REAL(wp), DIMENSION(0:ny) :: work_ffty !< 1368 REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !< 1369 REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !< 1370 REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< 1371 1372 1373 CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' ) 1374 1375 ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) ) 1376 1377 tn = 0 ! Default thread number in case of one thread 1378 !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy ) 1379 DO i = nxl_y, nxr_y 1380 1381 !$ tn = omp_get_thread_num() 1382 1383 IF ( loop_optimization == 'vector' ) THEN 1384 ! 1385 !-- Code optimized for vector processors 1386 DO k = 1, nz 1387 1388 m = 0 1389 DO n = 1, pdims(2) 1390 DO j = 1, nny 1391 work_triy(m,k) = ar(j,k,i,n) 1392 m = m + 1 1371 1393 ENDDO 1372 1373 ENDDO 1374 1375 CALL fft_y_m( work_triy, ny, 'forward' ) 1376 1377 ELSE 1378 ! 1379 ! -- Cache optimized code1380 DO k = 1, nz 1381 1382 m = 0 1383 DO n = 1, pdims(2)1384 DO j = 1, nny1385 work_ffty(m) = ar(j,k,i,n)1386 m = m + 11387 ENDDO1394 ENDDO 1395 1396 ENDDO 1397 1398 CALL fft_y_m( work_triy, ny, 'forward' ) 1399 1400 ELSE 1401 ! 1402 !-- Cache optimized code 1403 DO k = 1, nz 1404 1405 m = 0 1406 DO n = 1, pdims(2) 1407 DO j = 1, nny 1408 work_ffty(m) = ar(j,k,i,n) 1409 m = m + 1 1388 1410 ENDDO 1389 1390 CALL fft_y_1d( work_ffty, 'forward' ) 1391 1392 DO j = 0, ny 1393 work_triy(j,k) = work_ffty(j) 1411 ENDDO 1412 1413 CALL fft_y_1d( work_ffty, 'forward' ) 1414 1415 DO j = 0, ny 1416 work_triy(j,k) = work_ffty(j) 1417 ENDDO 1418 1419 ENDDO 1420 1421 ENDIF 1422 1423 ! 1424 !-- Solve the linear equation system 1425 CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) ) 1426 1427 IF ( loop_optimization == 'vector' ) THEN 1428 ! 1429 !-- Code optimized for vector processors 1430 CALL fft_y_m( work_triy, ny, 'backward' ) 1431 1432 DO k = 1, nz 1433 1434 m = 0 1435 DO n = 1, pdims(2) 1436 DO j = 1, nny 1437 ar(j,k,i,n) = work_triy(m,k) 1438 m = m + 1 1394 1439 ENDDO 1395 1396 ENDDO 1397 1398 ENDIF 1399 1400 ! 1401 !-- Solve the linear equation system 1402 CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) ) 1403 1404 IF ( loop_optimization == 'vector' ) THEN 1405 ! 1406 !-- Code optimized for vector processors 1407 CALL fft_y_m( work_triy, ny, 'backward' ) 1408 1409 DO k = 1, nz 1410 1411 m = 0 1412 DO n = 1, pdims(2) 1413 DO j = 1, nny 1414 ar(j,k,i,n) = work_triy(m,k) 1415 m = m + 1 1416 ENDDO 1440 ENDDO 1441 1442 ENDDO 1443 1444 ELSE 1445 ! 1446 !-- Cache optimized code 1447 DO k = 1, nz 1448 1449 DO j = 0, ny 1450 work_ffty(j) = work_triy(j,k) 1451 ENDDO 1452 1453 CALL fft_y_1d( work_ffty, 'backward' ) 1454 1455 m = 0 1456 DO n = 1, pdims(2) 1457 DO j = 1, nny 1458 ar(j,k,i,n) = work_ffty(m) 1459 m = m + 1 1417 1460 ENDDO 1418 1419 ENDDO 1420 1421 ELSE 1422 ! 1423 !-- Cache optimized code 1424 DO k = 1, nz 1425 1426 DO j = 0, ny 1427 work_ffty(j) = work_triy(j,k) 1428 ENDDO 1429 1430 CALL fft_y_1d( work_ffty, 'backward' ) 1431 1432 m = 0 1433 DO n = 1, pdims(2) 1434 DO j = 1, nny 1435 ar(j,k,i,n) = work_ffty(m) 1436 m = m + 1 1437 ENDDO 1438 ENDDO 1439 1440 ENDDO 1441 1442 ENDIF 1443 1444 ENDDO 1445 1446 DEALLOCATE( tri ) 1447 1448 CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' ) 1449 1450 END SUBROUTINE ffty_tri_ffty 1461 ENDDO 1462 1463 ENDDO 1464 1465 ENDIF 1466 1467 ENDDO 1468 1469 DEALLOCATE( tri ) 1470 1471 CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' ) 1472 1473 END SUBROUTINE ffty_tri_ffty 1451 1474 1452 1475 END MODULE poisfft_mod
Note: See TracChangeset
for help on using the changeset viewer.