Changeset 1106 for palm/trunk/SOURCE/fft_xy.f90
- Timestamp:
- Mar 4, 2013 5:31:38 AM (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/fft_xy.f90
r1093 r1106 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! CUDA fft added 23 ! array_kind renamed precision_kind, 3D- instead of 1D-loops in fft_x and fft_y 24 ! old fft_x, fft_y become fft_x_1d, fft_y_1d and are used for 1D-decomposition 23 25 ! 24 26 ! Former revisions: … … 62 64 !------------------------------------------------------------------------------! 63 65 64 USE array_kind65 66 USE control_parameters 66 67 USE indices 68 USE precision_kind 67 69 USE singleton 68 70 USE temperton_fft 71 USE transpose_indices 69 72 70 73 IMPLICIT NONE 71 74 72 75 PRIVATE 73 PUBLIC fft_x, fft_ y, fft_init, fft_x_m, fft_y_m76 PUBLIC fft_x, fft_x_1d, fft_y, fft_y_1d, fft_init, fft_x_m, fft_y_m 74 77 75 78 INTEGER, DIMENSION(:), ALLOCATABLE, SAVE :: ifax_x, ifax_y … … 77 80 LOGICAL, SAVE :: init_fft = .FALSE. 78 81 79 REAL, SAVE :: sqr_nx, sqr_ny82 REAL, SAVE :: dnx, dny, sqr_dnx, sqr_dny 80 83 REAL, DIMENSION(:), ALLOCATABLE, SAVE :: trigs_x, trigs_y 81 84 … … 90 93 REAL, DIMENSION(:), ALLOCATABLE, SAVE :: trig_xb, trig_xf, trig_yb, & 91 94 trig_yf 95 #elif defined( __cuda_fft ) 96 INTEGER, SAVE :: plan_xf, plan_xi, plan_yf, plan_yi, total_points_x_transpo, & 97 total_points_y_transpo 92 98 #endif 93 99 … … 102 108 END INTERFACE fft_x 103 109 110 INTERFACE fft_x_1d 111 MODULE PROCEDURE fft_x_1d 112 END INTERFACE fft_x_1d 113 104 114 INTERFACE fft_y 105 115 MODULE PROCEDURE fft_y 106 116 END INTERFACE fft_y 107 117 118 INTERFACE fft_y_1d 119 MODULE PROCEDURE fft_y_1d 120 END INTERFACE fft_y_1d 121 108 122 INTERFACE fft_x_m 109 123 MODULE PROCEDURE fft_x_m … … 118 132 119 133 SUBROUTINE fft_init 134 135 USE cuda_fft_interfaces 120 136 121 137 IMPLICIT NONE … … 145 161 IF ( fft_method == 'system-specific' ) THEN 146 162 147 sqr_nx = SQRT( 1.0 / ( nx + 1.0 ) ) 148 sqr_ny = SQRT( 1.0 / ( ny + 1.0 ) ) 163 dnx = 1.0 / ( nx + 1.0 ) 164 dny = 1.0 / ( ny + 1.0 ) 165 sqr_dnx = SQRT( dnx ) 166 sqr_dny = SQRT( dny ) 149 167 #if defined( __ibm ) && ! defined( __ibmy_special ) 150 168 ! 151 169 !-- Initialize tables for fft along x 152 CALL DRCFT( 1, workx, 1, workx, 1, nx+1, 1, 1, sqr_ nx, aux1, nau1, &170 CALL DRCFT( 1, workx, 1, workx, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & 153 171 aux2, nau2 ) 154 CALL DCRFT( 1, workx, 1, workx, 1, nx+1, 1, -1, sqr_ nx, aux3, nau1, &172 CALL DCRFT( 1, workx, 1, workx, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & 155 173 aux4, nau2 ) 156 174 ! 157 175 !-- Initialize tables for fft along y 158 CALL DRCFT( 1, worky, 1, worky, 1, ny+1, 1, 1, sqr_ ny, auy1, nau1, &176 CALL DRCFT( 1, worky, 1, worky, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & 159 177 auy2, nau2 ) 160 CALL DCRFT( 1, worky, 1, worky, 1, ny+1, 1, -1, sqr_ ny, auy3, nau1, &178 CALL DCRFT( 1, worky, 1, worky, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & 161 179 auy4, nau2 ) 162 180 #elif defined( __nec ) … … 175 193 ! 176 194 !-- Initialize tables for fft along x (non-vector and vector case (M)) 177 CALL DZFFT( 0, nx+1, sqr_ nx, work_x, work_x, trig_xf, workx, 0 )178 CALL ZDFFT( 0, nx+1, sqr_ nx, work_x, work_x, trig_xb, workx, 0 )179 CALL DZFFTM( 0, nx+1, nz1, sqr_ nx, work_x, nx+4, work_x, nx+4, &195 CALL DZFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xf, workx, 0 ) 196 CALL ZDFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xb, workx, 0 ) 197 CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, & 180 198 trig_xf, workx, 0 ) 181 CALL ZDFFTM( 0, nx+1, nz1, sqr_ nx, work_x, nx+4, work_x, nx+4, &199 CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, & 182 200 trig_xb, workx, 0 ) 183 201 ! 184 202 !-- Initialize tables for fft along y (non-vector and vector case (M)) 185 CALL DZFFT( 0, ny+1, sqr_ ny, work_y, work_y, trig_yf, worky, 0 )186 CALL ZDFFT( 0, ny+1, sqr_ ny, work_y, work_y, trig_yb, worky, 0 )187 CALL DZFFTM( 0, ny+1, nz1, sqr_ ny, work_y, ny+4, work_y, ny+4, &203 CALL DZFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yf, worky, 0 ) 204 CALL ZDFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yb, worky, 0 ) 205 CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, & 188 206 trig_yf, worky, 0 ) 189 CALL ZDFFTM( 0, ny+1, nz1, sqr_ ny, work_y, ny+4, work_y, ny+4, &207 CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, & 190 208 trig_yb, worky, 0 ) 209 #elif defined( __cuda_fft ) 210 total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) 211 total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) 212 CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny+1)*nz ) 213 CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny+1)*nz ) 214 CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx+1)*nz ) 215 CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx+1)*nz ) 191 216 #else 192 217 message_string = 'no system-specific fft-call available' … … 222 247 ! ! 223 248 ! Fourier-transformation along x-direction ! 249 ! Version for 2D-decomposition ! 224 250 ! ! 225 251 ! fft_x uses internal algorithms (Singleton or Temperton) or ! … … 227 253 !----------------------------------------------------------------------! 228 254 255 USE cuda_fft_interfaces 256 257 IMPLICIT NONE 258 259 CHARACTER (LEN=*) :: direction 260 INTEGER :: i, ishape(1), j, k, m 261 262 LOGICAL :: forward_fft 263 264 REAL, DIMENSION(0:nx+2) :: work 265 REAL, DIMENSION(nx+2) :: work1 266 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork 267 #if defined( __ibm ) 268 REAL, DIMENSION(nau2) :: aux2, aux4 269 #elif defined( __nec ) 270 REAL, DIMENSION(6*(nx+1)) :: work2 271 #elif defined( __cuda_fft ) 272 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 273 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 274 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 275 #endif 276 REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: ar 277 278 IF ( direction == 'forward' ) THEN 279 forward_fft = .TRUE. 280 ELSE 281 forward_fft = .FALSE. 282 ENDIF 283 284 IF ( fft_method == 'singleton-algorithm' ) THEN 285 286 ! 287 !-- Performing the fft with singleton's software works on every system, 288 !-- since it is part of the model 289 ALLOCATE( cwork(0:nx) ) 290 291 IF ( forward_fft ) then 292 293 !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k ) 294 !$OMP DO 295 DO k = nzb_x, nzt_x 296 DO j = nys_x, nyn_x 297 298 DO i = 0, nx 299 cwork(i) = CMPLX( ar(i,j,k) ) 300 ENDDO 301 302 ishape = SHAPE( cwork ) 303 CALL FFTN( cwork, ishape ) 304 305 DO i = 0, (nx+1)/2 306 ar(i,j,k) = REAL( cwork(i) ) 307 ENDDO 308 DO i = 1, (nx+1)/2 - 1 309 ar(nx+1-i,j,k) = -AIMAG( cwork(i) ) 310 ENDDO 311 312 ENDDO 313 ENDDO 314 !$OMP END PARALLEL 315 316 ELSE 317 318 !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k ) 319 !$OMP DO 320 DO k = nzb_x, nzt_x 321 DO j = nys_x, nyn_x 322 323 cwork(0) = CMPLX( ar(0,j,k), 0.0 ) 324 DO i = 1, (nx+1)/2 - 1 325 cwork(i) = CMPLX( ar(i,j,k), -ar(nx+1-i,j,k) ) 326 cwork(nx+1-i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 327 ENDDO 328 cwork((nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 329 330 ishape = SHAPE( cwork ) 331 CALL FFTN( cwork, ishape, inv = .TRUE. ) 332 333 DO i = 0, nx 334 ar(i,j,k) = REAL( cwork(i) ) 335 ENDDO 336 337 ENDDO 338 ENDDO 339 !$OMP END PARALLEL 340 341 ENDIF 342 343 DEALLOCATE( cwork ) 344 345 ELSEIF ( fft_method == 'temperton-algorithm' ) THEN 346 347 ! 348 !-- Performing the fft with Temperton's software works on every system, 349 !-- since it is part of the model 350 IF ( forward_fft ) THEN 351 352 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 353 !$OMP DO 354 DO k = nzb_x, nzt_x 355 DO j = nys_x, nyn_x 356 357 work(0:nx) = ar(0:nx,j,k) 358 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 ) 359 360 DO i = 0, (nx+1)/2 361 ar(i,j,k) = work(2*i) 362 ENDDO 363 DO i = 1, (nx+1)/2 - 1 364 ar(nx+1-i,j,k) = work(2*i+1) 365 ENDDO 366 367 ENDDO 368 ENDDO 369 !$OMP END PARALLEL 370 371 ELSE 372 373 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 374 !$OMP DO 375 DO k = nzb_x, nzt_x 376 DO j = nys_x, nyn_x 377 378 DO i = 0, (nx+1)/2 379 work(2*i) = ar(i,j,k) 380 ENDDO 381 DO i = 1, (nx+1)/2 - 1 382 work(2*i+1) = ar(nx+1-i,j,k) 383 ENDDO 384 work(1) = 0.0 385 work(nx+2) = 0.0 386 387 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 ) 388 ar(0:nx,j,k) = work(0:nx) 389 390 ENDDO 391 ENDDO 392 !$OMP END PARALLEL 393 394 ENDIF 395 396 ELSEIF ( fft_method == 'system-specific' ) THEN 397 398 #if defined( __ibm ) && ! defined( __ibmy_special ) 399 IF ( forward_fft ) THEN 400 401 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 402 !$OMP DO 403 DO k = nzb_x, nzt_x 404 DO j = nys_x, nyn_x 405 406 CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & 407 aux2, nau2 ) 408 409 DO i = 0, (nx+1)/2 410 ar(i,j,k) = work(2*i) 411 ENDDO 412 DO i = 1, (nx+1)/2 - 1 413 ar(nx+1-i,j,k) = work(2*i+1) 414 ENDDO 415 416 ENDDO 417 ENDDO 418 !$OMP END PARALLEL 419 420 ELSE 421 422 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 423 !$OMP DO 424 DO k = nzb_x, nzt_x 425 DO j = nys_x, nyn_x 426 427 DO i = 0, (nx+1)/2 428 work(2*i) = ar(i,j,k) 429 ENDDO 430 DO i = 1, (nx+1)/2 - 1 431 work(2*i+1) = ar(nx+1-i,j,k) 432 ENDDO 433 work(1) = 0.0 434 work(nx+2) = 0.0 435 436 CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & 437 aux4, nau2 ) 438 439 DO i = 0, nx 440 ar(i,j,k) = work(i) 441 ENDDO 442 443 ENDDO 444 ENDDO 445 !$OMP END PARALLEL 446 447 ENDIF 448 449 #elif defined( __nec ) 450 451 IF ( forward_fft ) THEN 452 453 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 454 !$OMP DO 455 DO k = nzb_x, nzt_x 456 DO j = nys_x, nyn_x 457 458 work(0:nx) = ar(0:nx,j,k) 459 460 CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 ) 461 462 DO i = 0, (nx+1)/2 463 ar(i,j,k) = work(2*i) 464 ENDDO 465 DO i = 1, (nx+1)/2 - 1 466 ar(nx+1-i,j,k) = work(2*i+1) 467 ENDDO 468 469 ENDDO 470 ENDDO 471 !$END OMP PARALLEL 472 473 ELSE 474 475 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 476 !$OMP DO 477 DO k = nzb_x, nzt_x 478 DO j = nys_x, nyn_x 479 480 DO i = 0, (nx+1)/2 481 work(2*i) = ar(i,j,k) 482 ENDDO 483 DO i = 1, (nx+1)/2 - 1 484 work(2*i+1) = ar(nx+1-i,j,k) 485 ENDDO 486 work(1) = 0.0 487 work(nx+2) = 0.0 488 489 CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 ) 490 491 ar(0:nx,j,k) = work(0:nx) 492 493 ENDDO 494 ENDDO 495 !$OMP END PARALLEL 496 497 ENDIF 498 499 #elif defined( __cuda_fft ) 500 501 ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) ) 502 ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) ) 503 ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) ) 504 505 m = 0 506 507 IF ( forward_fft ) THEN 508 509 cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x) 510 511 CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device ) 512 cuda_host = cuda_b_device 513 514 DO k = nzb_x, nzt_x 515 DO j = nys_x, nyn_x 516 517 DO i = 0, (nx+1)/2 518 ar(i,j,k) = REAL( cuda_host(m+i) ) * dnx 519 ENDDO 520 521 DO i = 1, (nx+1)/2 - 1 522 ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx 523 ENDDO 524 525 m = m + (nx+1)/2 + 1 526 527 ENDDO 528 ENDDO 529 530 ELSE 531 532 DO k = nzb_x, nzt_x 533 DO j = nys_x, nyn_x 534 535 cuda_host(m) = CMPLX( ar(0,j,k), 0.0 ) 536 537 DO i = 1, (nx+1)/2 - 1 538 cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 539 ENDDO 540 cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 541 542 m = m + (nx+1)/2 + 1 543 544 ENDDO 545 ENDDO 546 547 cuda_b_device = cuda_host 548 CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device ) 549 550 ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device 551 552 ENDIF 553 554 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 555 556 #else 557 message_string = 'no system-specific fft-call available' 558 CALL message( 'fft_x', 'PA0188', 1, 2, 0, 6, 0 ) 559 #endif 560 561 ELSE 562 563 message_string = 'fft method "' // TRIM( fft_method) // & 564 '" not available' 565 CALL message( 'fft_x', 'PA0189', 1, 2, 0, 6, 0 ) 566 567 ENDIF 568 569 END SUBROUTINE fft_x 570 571 SUBROUTINE fft_x_1d( ar, direction ) 572 573 !----------------------------------------------------------------------! 574 ! fft_x_1d ! 575 ! ! 576 ! Fourier-transformation along x-direction ! 577 ! Version for 1D-decomposition ! 578 ! ! 579 ! fft_x uses internal algorithms (Singleton or Temperton) or ! 580 ! system-specific routines, if they are available ! 581 !----------------------------------------------------------------------! 582 229 583 IMPLICIT NONE 230 584 … … 232 586 INTEGER :: i, ishape(1) 233 587 234 !kk REAL, DIMENSION(:) :: ar !kk Does NOT work (Bug??) 588 LOGICAL :: forward_fft 589 235 590 REAL, DIMENSION(0:nx) :: ar 236 591 REAL, DIMENSION(0:nx+2) :: work … … 243 598 #endif 244 599 600 IF ( direction == 'forward' ) THEN 601 forward_fft = .TRUE. 602 ELSE 603 forward_fft = .FALSE. 604 ENDIF 605 245 606 IF ( fft_method == 'singleton-algorithm' ) THEN 246 607 … … 250 611 ALLOCATE( cwork(0:nx) ) 251 612 252 IF ( direction == 'forward') then613 IF ( forward_fft ) then 253 614 254 615 DO i = 0, nx … … 257 618 ishape = SHAPE( cwork ) 258 619 CALL FFTN( cwork, ishape ) 259 260 620 DO i = 0, (nx+1)/2 261 621 ar(i) = REAL( cwork(i) ) … … 290 650 !-- Performing the fft with Temperton's software works on every system, 291 651 !-- since it is part of the model 292 IF ( direction == 'forward') THEN652 IF ( forward_fft ) THEN 293 653 294 654 work(0:nx) = ar … … 321 681 322 682 #if defined( __ibm ) && ! defined( __ibmy_special ) 323 IF ( direction == 'forward') THEN324 325 CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_ nx, aux1, nau1, &683 IF ( forward_fft ) THEN 684 685 CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & 326 686 aux2, nau2 ) 327 687 … … 344 704 work(nx+2) = 0.0 345 705 346 CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_ nx, aux3, nau1, &706 CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & 347 707 aux4, nau2 ) 348 708 … … 353 713 ENDIF 354 714 #elif defined( __nec ) 355 IF ( direction == 'forward') THEN715 IF ( forward_fft ) THEN 356 716 357 717 work(0:nx) = ar(0:nx) 358 718 359 CALL DZFFT( 1, nx+1, sqr_ nx, work, work, trig_xf, work2, 0 )360 719 CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 ) 720 361 721 DO i = 0, (nx+1)/2 362 722 ar(i) = work(2*i) … … 377 737 work(nx+2) = 0.0 378 738 379 CALL ZDFFT( -1, nx+1, sqr_ nx, work, work, trig_xb, work2, 0 )739 CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 ) 380 740 381 741 ar(0:nx) = work(0:nx) … … 384 744 #else 385 745 message_string = 'no system-specific fft-call available' 386 CALL message( 'fft_x ', 'PA0188', 1, 2, 0, 6, 0 )746 CALL message( 'fft_x_1d', 'PA0188', 1, 2, 0, 6, 0 ) 387 747 #endif 388 748 ELSE 389 749 message_string = 'fft method "' // TRIM( fft_method) // & 390 750 '" not available' 391 CALL message( 'fft_x ', 'PA0189', 1, 2, 0, 6, 0 )751 CALL message( 'fft_x_1d', 'PA0189', 1, 2, 0, 6, 0 ) 392 752 393 753 ENDIF 394 754 395 END SUBROUTINE fft_x 755 END SUBROUTINE fft_x_1d 396 756 397 757 SUBROUTINE fft_y( ar, direction ) … … 401 761 ! ! 402 762 ! Fourier-transformation along y-direction ! 763 ! Version for 2D-decomposition ! 403 764 ! ! 404 765 ! fft_y uses internal algorithms (Singleton or Temperton) or ! … … 406 767 !----------------------------------------------------------------------! 407 768 769 USE cuda_fft_interfaces 770 771 IMPLICIT NONE 772 773 CHARACTER (LEN=*) :: direction 774 INTEGER :: i, j, jshape(1), k, m 775 776 LOGICAL :: forward_fft 777 778 REAL, DIMENSION(0:ny+2) :: work 779 REAL, DIMENSION(ny+2) :: work1 780 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork 781 #if defined( __ibm ) 782 REAL, DIMENSION(nau2) :: auy2, auy4 783 #elif defined( __nec ) 784 REAL, DIMENSION(6*(ny+1)) :: work2 785 #elif defined( __cuda_fft ) 786 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 787 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 788 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 789 #endif 790 REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) :: ar 791 792 IF ( direction == 'forward' ) THEN 793 forward_fft = .TRUE. 794 ELSE 795 forward_fft = .FALSE. 796 ENDIF 797 798 IF ( fft_method == 'singleton-algorithm' ) THEN 799 800 ! 801 !-- Performing the fft with singleton's software works on every system, 802 !-- since it is part of the model 803 ALLOCATE( cwork(0:ny) ) 804 805 IF ( forward_fft ) then 806 807 !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k ) 808 !$OMP DO 809 DO k = nzb_y, nzt_y 810 DO i = nxl_y, nxr_y 811 812 DO j = 0, ny 813 cwork(j) = CMPLX( ar(j,i,k) ) 814 ENDDO 815 816 jshape = SHAPE( cwork ) 817 CALL FFTN( cwork, jshape ) 818 819 DO j = 0, (ny+1)/2 820 ar(j,i,k) = REAL( cwork(j) ) 821 ENDDO 822 DO j = 1, (ny+1)/2 - 1 823 ar(ny+1-j,i,k) = -AIMAG( cwork(j) ) 824 ENDDO 825 826 ENDDO 827 ENDDO 828 !$OMP END PARALLEL 829 830 ELSE 831 832 !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k ) 833 !$OMP DO 834 DO k = nzb_y, nzt_y 835 DO i = nxl_y, nxr_y 836 837 cwork(0) = CMPLX( ar(0,i,k), 0.0 ) 838 DO j = 1, (ny+1)/2 - 1 839 cwork(j) = CMPLX( ar(j,i,k), -ar(ny+1-j,i,k) ) 840 cwork(ny+1-j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 841 ENDDO 842 cwork((ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 843 844 jshape = SHAPE( cwork ) 845 CALL FFTN( cwork, jshape, inv = .TRUE. ) 846 847 DO j = 0, ny 848 ar(j,i,k) = REAL( cwork(j) ) 849 ENDDO 850 851 ENDDO 852 ENDDO 853 !$OMP END PARALLEL 854 855 ENDIF 856 857 DEALLOCATE( cwork ) 858 859 ELSEIF ( fft_method == 'temperton-algorithm' ) THEN 860 861 ! 862 !-- Performing the fft with Temperton's software works on every system, 863 !-- since it is part of the model 864 IF ( forward_fft ) THEN 865 866 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 867 !$OMP DO 868 DO k = nzb_y, nzt_y 869 DO i = nxl_y, nxr_y 870 871 work(0:ny) = ar(0:ny,i,k) 872 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 ) 873 874 DO j = 0, (ny+1)/2 875 ar(j,i,k) = work(2*j) 876 ENDDO 877 DO j = 1, (ny+1)/2 - 1 878 ar(ny+1-j,i,k) = work(2*j+1) 879 ENDDO 880 881 ENDDO 882 ENDDO 883 !$OMP END PARALLEL 884 885 ELSE 886 887 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 888 !$OMP DO 889 DO k = nzb_y, nzt_y 890 DO i = nxl_y, nxr_y 891 892 DO j = 0, (ny+1)/2 893 work(2*j) = ar(j,i,k) 894 ENDDO 895 DO j = 1, (ny+1)/2 - 1 896 work(2*j+1) = ar(ny+1-j,i,k) 897 ENDDO 898 work(1) = 0.0 899 work(ny+2) = 0.0 900 901 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 ) 902 ar(0:ny,i,k) = work(0:ny) 903 904 ENDDO 905 ENDDO 906 !$OMP END PARALLEL 907 908 ENDIF 909 910 ELSEIF ( fft_method == 'system-specific' ) THEN 911 912 #if defined( __ibm ) && ! defined( __ibmy_special ) 913 IF ( forward_fft) THEN 914 915 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 916 !$OMP DO 917 DO k = nzb_y, nzt_y 918 DO i = nxl_y, nxr_y 919 920 CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & 921 auy2, nau2 ) 922 923 DO j = 0, (ny+1)/2 924 ar(j,i,k) = work(2*j) 925 ENDDO 926 DO j = 1, (ny+1)/2 - 1 927 ar(ny+1-j,i,k) = work(2*j+1) 928 ENDDO 929 930 ENDDO 931 ENDDO 932 !$OMP END PARALLEL 933 934 ELSE 935 936 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 937 !$OMP DO 938 DO k = nzb_y, nzt_y 939 DO i = nxl_y, nxr_y 940 941 DO j = 0, (ny+1)/2 942 work(2*j) = ar(j,i,k) 943 ENDDO 944 DO j = 1, (ny+1)/2 - 1 945 work(2*j+1) = ar(ny+1-j,i,k) 946 ENDDO 947 work(1) = 0.0 948 work(ny+2) = 0.0 949 950 CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & 951 auy4, nau2 ) 952 953 DO j = 0, ny 954 ar(j,i,k) = work(j) 955 ENDDO 956 957 ENDDO 958 ENDDO 959 !$OMP END PARALLEL 960 961 ENDIF 962 #elif defined( __nec ) 963 IF ( forward_fft ) THEN 964 965 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 966 !$OMP DO 967 DO k = nzb_y, nzt_y 968 DO i = nxl_y, nxr_y 969 970 work(0:ny) = ar(0:ny,i,k) 971 972 CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 ) 973 974 DO j = 0, (ny+1)/2 975 ar(j,i,k) = work(2*j) 976 ENDDO 977 DO j = 1, (ny+1)/2 - 1 978 ar(ny+1-j,i,k) = work(2*j+1) 979 ENDDO 980 981 ENDDO 982 ENDDO 983 !$END OMP PARALLEL 984 985 ELSE 986 987 !$OMP PARALLEL PRIVATE ( work, i, j, k ) 988 !$OMP DO 989 DO k = nzb_y, nzt_y 990 DO i = nxl_y, nxr_y 991 992 DO j = 0, (ny+1)/2 993 work(2*j) = ar(j,i,k) 994 ENDDO 995 DO j = 1, (ny+1)/2 - 1 996 work(2*j+1) = ar(ny+1-j,i,k) 997 ENDDO 998 work(1) = 0.0 999 work(ny+2) = 0.0 1000 1001 CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 ) 1002 1003 ar(0:ny,i,k) = work(0:ny) 1004 1005 ENDDO 1006 ENDDO 1007 !$OMP END PARALLEL 1008 1009 ENDIF 1010 #elif defined( __cuda_fft ) 1011 1012 ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) ) 1013 ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) ) 1014 ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) ) 1015 1016 m = 0 1017 1018 IF ( forward_fft ) THEN 1019 1020 cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y) 1021 1022 CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device ) 1023 cuda_host = cuda_b_device 1024 1025 DO k = nzb_y, nzt_y 1026 DO i = nxl_y, nxr_y 1027 1028 DO j = 0, (ny+1)/2 1029 ar(j,i,k) = REAL( cuda_host(m+j) ) * dny 1030 ENDDO 1031 1032 DO j = 1, (ny+1)/2 - 1 1033 ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny 1034 ENDDO 1035 1036 m = m + (ny+1)/2 + 1 1037 1038 ENDDO 1039 ENDDO 1040 1041 ELSE 1042 1043 DO k = nzb_y, nzt_y 1044 DO i = nxl_y, nxr_y 1045 1046 cuda_host(m) = CMPLX( ar(0,i,k), 0.0 ) 1047 1048 DO j = 1, (ny+1)/2 - 1 1049 cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 1050 ENDDO 1051 cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 1052 1053 m = m + (ny+1)/2 + 1 1054 1055 ENDDO 1056 ENDDO 1057 1058 cuda_b_device = cuda_host 1059 CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device ) 1060 1061 ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device 1062 1063 ENDIF 1064 1065 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 1066 1067 #else 1068 message_string = 'no system-specific fft-call available' 1069 CALL message( 'fft_y', 'PA0188', 1, 2, 0, 6, 0 ) 1070 #endif 1071 1072 ELSE 1073 1074 message_string = 'fft method "' // TRIM( fft_method) // & 1075 '" not available' 1076 CALL message( 'fft_y', 'PA0189', 1, 2, 0, 6, 0 ) 1077 1078 ENDIF 1079 1080 END SUBROUTINE fft_y 1081 1082 SUBROUTINE fft_y_1d( ar, direction ) 1083 1084 !----------------------------------------------------------------------! 1085 ! fft_y_1d ! 1086 ! ! 1087 ! Fourier-transformation along y-direction ! 1088 ! Version for 1D-decomposition ! 1089 ! ! 1090 ! fft_y uses internal algorithms (Singleton or Temperton) or ! 1091 ! system-specific routines, if they are available ! 1092 !----------------------------------------------------------------------! 1093 408 1094 IMPLICIT NONE 409 1095 … … 411 1097 INTEGER :: j, jshape(1) 412 1098 413 !kk REAL, DIMENSION(:) :: ar !kk Does NOT work (Bug??) 1099 LOGICAL :: forward_fft 1100 414 1101 REAL, DIMENSION(0:ny) :: ar 415 1102 REAL, DIMENSION(0:ny+2) :: work … … 422 1109 #endif 423 1110 1111 IF ( direction == 'forward' ) THEN 1112 forward_fft = .TRUE. 1113 ELSE 1114 forward_fft = .FALSE. 1115 ENDIF 1116 424 1117 IF ( fft_method == 'singleton-algorithm' ) THEN 425 1118 … … 429 1122 ALLOCATE( cwork(0:ny) ) 430 1123 431 IF ( direction == 'forward') THEN1124 IF ( forward_fft ) THEN 432 1125 433 1126 DO j = 0, ny … … 470 1163 !-- Performing the fft with Temperton's software works on every system, 471 1164 !-- since it is part of the model 472 IF ( direction == 'forward') THEN1165 IF ( forward_fft ) THEN 473 1166 474 1167 work(0:ny) = ar … … 501 1194 502 1195 #if defined( __ibm ) && ! defined( __ibmy_special ) 503 IF ( direction == 'forward') THEN504 505 CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_ ny, auy1, nau1, &1196 IF ( forward_fft ) THEN 1197 1198 CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & 506 1199 auy2, nau2 ) 507 1200 … … 524 1217 work(ny+2) = 0.0 525 1218 526 CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_ ny, auy3, nau1, &1219 CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & 527 1220 auy4, nau2 ) 528 1221 … … 533 1226 ENDIF 534 1227 #elif defined( __nec ) 535 IF ( direction == 'forward') THEN1228 IF ( forward_fft ) THEN 536 1229 537 1230 work(0:ny) = ar(0:ny) 538 1231 539 CALL DZFFT( 1, ny+1, sqr_ ny, work, work, trig_yf, work2, 0 )1232 CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 ) 540 1233 541 1234 DO j = 0, (ny+1)/2 … … 557 1250 work(ny+2) = 0.0 558 1251 559 CALL ZDFFT( -1, ny+1, sqr_ ny, work, work, trig_yb, work2, 0 )1252 CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 ) 560 1253 561 1254 ar(0:ny) = work(0:ny) … … 564 1257 #else 565 1258 message_string = 'no system-specific fft-call available' 566 CALL message( 'fft_y ', 'PA0188', 1, 2, 0, 6, 0 )1259 CALL message( 'fft_y_1d', 'PA0188', 1, 2, 0, 6, 0 ) 567 1260 568 1261 #endif … … 572 1265 message_string = 'fft method "' // TRIM( fft_method) // & 573 1266 '" not available' 574 CALL message( 'fft_y ', 'PA0189', 1, 2, 0, 6, 0 )1267 CALL message( 'fft_y_1d', 'PA0189', 1, 2, 0, 6, 0 ) 575 1268 576 1269 ENDIF 577 1270 578 END SUBROUTINE fft_y 1271 END SUBROUTINE fft_y_1d 579 1272 580 1273 SUBROUTINE fft_x_m( ar, direction ) … … 654 1347 !-- Tables are initialized once more. This call should not be 655 1348 !-- necessary, but otherwise program aborts in asymmetric case 656 CALL DZFFTM( 0, nx+1, nz1, sqr_ nx, work, nx+4, work, nx+4, &1349 CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, & 657 1350 trig_xf, work1, 0 ) 658 1351 … … 662 1355 ENDIF 663 1356 664 CALL DZFFTM( 1, nx+1, nz1, sqr_ nx, ai, siza, work, sizw, &1357 CALL DZFFTM( 1, nx+1, nz1, sqr_dnx, ai, siza, work, sizw, & 665 1358 trig_xf, work1, 0 ) 666 1359 … … 679 1372 !-- Tables are initialized once more. This call should not be 680 1373 !-- necessary, but otherwise program aborts in asymmetric case 681 CALL ZDFFTM( 0, nx+1, nz1, sqr_ nx, work, nx+4, work, nx+4, &1374 CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, & 682 1375 trig_xb, work1, 0 ) 683 1376 … … 693 1386 ENDDO 694 1387 695 CALL ZDFFTM( -1, nx+1, nz1, sqr_ nx, work, sizw, ai, siza, &1388 CALL ZDFFTM( -1, nx+1, nz1, sqr_dnx, work, sizw, ai, siza, & 696 1389 trig_xb, work1, 0 ) 697 1390 … … 791 1484 !-- Tables are initialized once more. This call should not be 792 1485 !-- necessary, but otherwise program aborts in asymmetric case 793 CALL DZFFTM( 0, ny+1, nz1, sqr_ ny, work, ny+4, work, ny+4, &1486 CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, & 794 1487 trig_yf, work1, 0 ) 795 1488 … … 799 1492 ENDIF 800 1493 801 CALL DZFFTM( 1, ny+1, nz1, sqr_ ny, ai, siza, work, sizw, &1494 CALL DZFFTM( 1, ny+1, nz1, sqr_dny, ai, siza, work, sizw, & 802 1495 trig_yf, work1, 0 ) 803 1496 … … 816 1509 !-- Tables are initialized once more. This call should not be 817 1510 !-- necessary, but otherwise program aborts in asymmetric case 818 CALL ZDFFTM( 0, ny+1, nz1, sqr_ ny, work, ny+4, work, ny+4, &1511 CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, & 819 1512 trig_yb, work1, 0 ) 820 1513 … … 830 1523 ENDDO 831 1524 832 CALL ZDFFTM( -1, ny+1, nz1, sqr_ ny, work, sizw, ai, siza, &1525 CALL ZDFFTM( -1, ny+1, nz1, sqr_dny, work, sizw, ai, siza, & 833 1526 trig_yb, work1, 0 ) 834 1527 … … 852 1545 END SUBROUTINE fft_y_m 853 1546 1547 854 1548 END MODULE fft_xy
Note: See TracChangeset
for help on using the changeset viewer.