- Timestamp: Jan 22, 2019 10:56:42 PM (6 years ago)
- Location: palm/trunk
- Files: 2 added, 5 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/poisfft_mod.f90
r3655  r3690
  253    253  REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
  254    254  REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
         255
         256  #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
         257  #if __acc_fft_device
  255    258  !$ACC DECLARE CREATE(ar_inv)
         259  #endif
  256    260
  257    261  REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
    …      …
  265    269
  266    270  IF ( .NOT. poisfft_initialized ) CALL poisfft_init
         271
         272  #if !__acc_fft_device
         273  !$ACC UPDATE HOST(ar)
         274  #endif
  267    275
  268    276  #ifndef _OPENACC
    …      …
  705    713
  706    714  ENDIF
         715  #endif
         716
         717  #if !__acc_fft_device
         718  !$ACC UPDATE DEVICE(ar)
  707    719  #endif
  708    720
palm/trunk/SOURCE/transpose.f90
r3657  r3690
   91     91  ! Initial revision
   92     92  !
          93
          94  #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
          95
   93     96  !------------------------------------------------------------------------------!
   94     97  ! Description:
    …      …
  122    125  !$OMP PARALLEL PRIVATE ( i, j, k )
  123    126  !$OMP DO
         127  #if __acc_fft_device
  124    128  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  125    129  !$ACC PRESENT(f_inv, f_in)
         130  #endif
  126    131  DO i = 0, nx
  127    132  DO k = nzb_x, nzt_x
    …      …
  171    176
  172    177  REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
         178  #if __acc_fft_device
  173    179  !$ACC DECLARE CREATE(work)
         180  #endif
  174    181
  175    182
    …      …
  180    187  !-- Transpose array
  181    188  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
         189
         190  #if __acc_fft_device
  182    191  #ifndef __cuda_aware_mpi
  183    192  !$ACC UPDATE HOST(f_inv)
    …      …
  185    194  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
  186    195  #endif
         196  #endif
         197
  187    198  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
  188    199  CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
  189    200  work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
  190    201  comm1dy, ierr )
         202
         203  #if __acc_fft_device
  191    204  #ifndef __cuda_aware_mpi
  192    205  !$ACC UPDATE DEVICE(work)
    …      …
  194    207  !$ACC END HOST_DATA
  195    208  #endif
         209  #endif
         210
  196    211  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
  197    212
    …      …
  202    217  DO l = 0, pdims(2) - 1
  203    218  ys = 0 + l * ( nyn_x - nys_x + 1 )
         219  #if __acc_fft_device
  204    220  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  205    221  !$ACC PRESENT(f_out, work)
         222  #endif
  206    223  DO i = nxl_y, nxr_y
  207    224  DO k = nzb_y, nzt_y
    …      …
  221    238  !$OMP PARALLEL PRIVATE ( i, j, k )
  222    239  !$OMP DO
         240  #if __acc_fft_device
  223    241  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  224    242  !$ACC PRESENT(f_out, f_inv)
         243  #endif
  225    244  DO k = nzb_y, nzt_y
  226    245  DO i = nxl_y, nxr_y
    …      …
  266    285  !$OMP PARALLEL PRIVATE ( i, j, k )
  267    286  !$OMP DO
         287  #if __acc_fft_device
  268    288  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  269    289  !$ACC PRESENT(f_out, f_inv)
         290  #endif
  270    291  DO k = 1, nz
  271    292  DO i = nxl, nxr
    …      …
  315    336
  316    337  REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
         338  #if __acc_fft_device
  317    339  !$ACC DECLARE CREATE(work)
         340  #endif
  318    341
  319    342
    …      …
  330    353  DO l = 0, pdims(1) - 1
  331    354  xs = 0 + l * nnx
         355  #if __acc_fft_device
  332    356  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  333    357  !$ACC PRESENT(work, f_in)
         358  #endif
  334    359  DO k = nzb_x, nzt_x
  335    360  DO i = xs, xs + nnx - 1
    …      …
  345    370  !-- Transpose array
  346    371  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
         372
         373  #if __acc_fft_device
  347    374  #ifndef __cuda_aware_mpi
  348    375  !$ACC UPDATE HOST(work)
    …      …
  350    377  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
  351    378  #endif
         379  #endif
         380
  352    381  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
  353    382  CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
  354    383  f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
  355    384  comm1dx, ierr )
         385
         386  #if __acc_fft_device
  356    387  #ifndef __cuda_aware_mpi
  357    388  !$ACC UPDATE DEVICE(f_inv)
    …      …
  361    392  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
  362    393  #endif
         394  #endif
  363    395
  364    396  ELSE
    …      …
  368    400  !$OMP PARALLEL PRIVATE ( i, j, k )
  369    401  !$OMP DO
         402  #if __acc_fft_device
  370    403  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  371    404  !$ACC PRESENT(f_inv, f_in)
         405  #endif
  372    406  DO i = nxl, nxr
  373    407  DO j = nys, nyn
    …      …
  415    449  !$OMP PARALLEL PRIVATE ( i, j, k )
  416    450  !$OMP DO
         451  #if __acc_fft_device
  417    452  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  418    453  !$ACC PRESENT(f_out, f_inv)
         454  #endif
  419    455  DO i = 0, nx
  420    456  DO k = nzb_x, nzt_x
    …      …
  464    500
  465    501  REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
         502  #if __acc_fft_device
  466    503  !$ACC DECLARE CREATE(work)
         504  #endif
  467    505
  468    506
    …      …
  476    514  DO l = 0, pdims(2) - 1
  477    515  ys = 0 + l * ( nyn_x - nys_x + 1 )
         516  #if __acc_fft_device
  478    517  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  479    518  !$ACC PRESENT(work, f_in)
         519  #endif
  480    520  DO i = nxl_y, nxr_y
  481    521  DO k = nzb_y, nzt_y
    …      …
  491    531  !-- Transpose array
  492    532  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
         533
         534  #if __acc_fft_device
  493    535  #ifndef __cuda_aware_mpi
  494    536  !$ACC UPDATE HOST(work)
    …      …
  496    538  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
  497    539  #endif
         540  #endif
         541
  498    542  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
  499    543  CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
  500    544  f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
  501    545  comm1dy, ierr )
         546
         547  #if __acc_fft_device
  502    548  #ifndef __cuda_aware_mpi
  503    549  !$ACC UPDATE DEVICE(f_inv)
    …      …
  505    551  !$ACC END HOST_DATA
  506    552  #endif
         553  #endif
         554
  507    555  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
  508    556  #endif
    …      …
  514    562  !$OMP PARALLEL PRIVATE ( i, j, k )
  515    563  !$OMP DO
         564  #if __acc_fft_device
  516    565  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  517    566  !$ACC PRESENT(f_inv, f_in)
         567  #endif
  518    568  DO i = nxl_y, nxr_y
  519    569  DO k = nzb_y, nzt_y
    …      …
  641    691  !$OMP PARALLEL PRIVATE ( i, j, k )
  642    692  !$OMP DO
         693  #if __acc_fft_device
  643    694  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  644    695  !$ACC PRESENT(f_inv, f_in)
         696  #endif
  645    697  DO j = 0, ny
  646    698  DO k = nzb_y, nzt_y
    …      …
  690    742
  691    743  REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
         744  #if __acc_fft_device
  692    745  !$ACC DECLARE CREATE(work)
         746  #endif
  693    747
  694    748
    …      …
  700    754  !$OMP PARALLEL PRIVATE ( i, j, k )
  701    755  !$OMP DO
         756  #if __acc_fft_device
  702    757  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  703    758  !$ACC PRESENT(f_out, f_inv)
         759  #endif
  704    760  DO j = 0, ny
  705    761  DO k = nzb_y, nzt_y
    …      …
  717    773  !-- Transpose array
  718    774  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
         775
         776  #if __acc_fft_device
  719    777  #ifndef __cuda_aware_mpi
  720    778  !$ACC UPDATE HOST(f_inv)
    …      …
  722    780  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
  723    781  #endif
         782  #endif
         783
  724    784  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
  725    785  CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
  726    786  work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
  727    787  comm1dx, ierr )
         788
         789  #if __acc_fft_device
  728    790  #ifndef __cuda_aware_mpi
  729    791  !$ACC UPDATE DEVICE(work)
    …      …
  731    793  !$ACC END HOST_DATA
  732    794  #endif
         795  #endif
         796
  733    797  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
  734    798
    …      …
  739    803  DO l = 0, pdims(1) - 1
  740    804  zs = 1 + l * ( nzt_y - nzb_y + 1 )
         805  #if __acc_fft_device
  741    806  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  742    807  !$ACC PRESENT(f_out, work)
         808  #endif
  743    809  DO j = nys_z, nyn_z
  744    810  DO k = zs, zs + nzt_y - nzb_y
    …      …
  785    851  !$OMP PARALLEL PRIVATE ( i, j, k )
  786    852  !$OMP DO
         853  #if __acc_fft_device
  787    854  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  788    855  !$ACC PRESENT(f_in, f_inv)
         856  #endif
  789    857  DO k = 1,nz
  790    858  DO i = nxl, nxr
    …      …
  834    902
  835    903  REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
         904  #if __acc_fft_device
  836    905  !$ACC DECLARE CREATE(work)
         906  #endif
  837    907
  838    908
    …      …
  844    914  !$OMP PARALLEL PRIVATE ( i, j, k )
  845    915  !$OMP DO
         916  #if __acc_fft_device
  846    917  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  847    918  !$ACC PRESENT(f_out, f_inv)
         919  #endif
  848    920  DO k = 1, nz
  849    921  DO i = nxl, nxr
    …      …
  861    933  !-- Transpose array
  862    934  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
         935
         936  #if __acc_fft_device
  863    937  #ifndef __cuda_aware_mpi
  864    938  !$ACC UPDATE HOST(f_inv)
    …      …
  866    940  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
  867    941  #endif
         942  #endif
         943
  868    944  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
  869    945  CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
  870    946  work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
  871    947  comm1dx, ierr )
         948
         949  #if __acc_fft_device
  872    950  #ifndef __cuda_aware_mpi
  873    951  !$ACC UPDATE DEVICE(work)
    …      …
  875    953  !$ACC END HOST_DATA
  876    954  #endif
         955  #endif
         956
  877    957  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
  878    958
    …      …
  883    963  DO l = 0, pdims(1) - 1
  884    964  xs = 0 + l * nnx
         965  #if __acc_fft_device
  885    966  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  886    967  !$ACC PRESENT(f_out, work)
         968  #endif
  887    969  DO k = nzb_x, nzt_x
  888    970  DO i = xs, xs + nnx - 1
    …      …
  933   1015  !$OMP PARALLEL PRIVATE ( i, j, k )
  934   1016  !$OMP DO
        1017  #if __acc_fft_device
  935   1018  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  936   1019  !$ACC PRESENT(f_out, f_inv)
        1020  #endif
  937   1021  DO k = nzb_y, nzt_y
  938   1022  DO j = 0, ny
    …      …
  982   1066
  983   1067  REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
        1068  #if __acc_fft_device
  984   1069  !$ACC DECLARE CREATE(work)
        1070  #endif
  985   1071
  986   1072  !
    …      …
  996   1082  DO l = 0, pdims(1) - 1
  997   1083  zs = 1 + l * ( nzt_y - nzb_y + 1 )
        1084  #if __acc_fft_device
  998   1085  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
  999   1086  !$ACC PRESENT(work, f_in)
        1087  #endif
 1000   1088  DO j = nys_z, nyn_z
 1001   1089  DO k = zs, zs + nzt_y - nzb_y
    …      …
 1011   1099  !-- Transpose array
 1012   1100  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
        1101
        1102  #if __acc_fft_device
 1013   1103  #ifndef __cuda_aware_mpi
 1014   1104  !$ACC UPDATE HOST(work)
    …      …
 1016   1106  !$ACC HOST_DATA USE_DEVICE(work, f_inv)
 1017   1107  #endif
        1108  #endif
        1109
 1018   1110  IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
 1019   1111  CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
 1020   1112  f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
 1021   1113  comm1dx, ierr )
        1114
        1115  #if __acc_fft_device
 1022   1116  #ifndef __cuda_aware_mpi
 1023   1117  !$ACC UPDATE DEVICE(f_inv)
    …      …
 1025   1119  !$ACC END HOST_DATA
 1026   1120  #endif
        1121  #endif
        1122
 1027   1123  CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 1028   1124  #endif
    …      …
 1033   1129  !$OMP PARALLEL PRIVATE ( i, j, k )
 1034   1130  !$OMP DO
        1131  #if __acc_fft_device
 1035   1132  !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
 1036   1133  !$ACC PRESENT(f_inv, f_in)
        1134  #endif
 1037   1135  DO k = nzb_y, nzt_y
 1038   1136  DO j = 0, ny
palm/trunk/SOURCE/tridia_solver_mod.f90
r3655  r3690
  101    101  ! the beginning, i.e. routine split is called within tridia_init.
  102    102  !
         103
         104  #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
         105
  103    106  !
  104    107  ! Description:
    …      …
  173    176  CALL split
  174    177
         178  #if __acc_fft_device
  175    179  !$ACC ENTER DATA &
  176    180  !$ACC COPYIN(ddzuw(0:nz-1,1:3)) &
  177    181  !$ACC COPYIN(tri(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,1:2))
         182  #endif
  178    183
  179    184  END SUBROUTINE tridia_init
    …      …
  297    302
  298    303  REAL(wp), DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1 !<
         304  #if __acc_fft_device
  299    305  !$ACC DECLARE CREATE(ar1)
         306  #endif
  300    307
  301    308  !
  302    309  !-- Forward substitution
         310  #if __acc_fft_device
  303    311  !$ACC PARALLEL PRESENT(ar, ar1, tri) PRIVATE(i,j,k)
         312  #endif
  304    313  DO k = 0, nz - 1
         314  #if __acc_fft_device
  305    315  !$ACC LOOP COLLAPSE(2)
         316  #endif
  306    317  DO j = nys_z, nyn_z
  307    318  DO i = nxl_z, nxr_z
    …      …
  316    327  ENDDO
  317    328  ENDDO
         329  #if __acc_fft_device
  318    330  !$ACC END PARALLEL
         331  #endif
  319    332
  320    333  !
    …      …
  323    336  !-- by zero appearing if the pressure bc is set to neumann at the top of
  324    337  !-- the model domain.
         338  #if __acc_fft_device
  325    339  !$ACC PARALLEL PRESENT(ar, ar1, ddzuw, tri) PRIVATE(i,j,k)
         340  #endif
  326    341  DO k = nz-1, 0, -1
         342  #if __acc_fft_device
  327    343  !$ACC LOOP COLLAPSE(2)
         344  #endif
  328    345  DO j = nys_z, nyn_z
  329    346  DO i = nxl_z, nxr_z
    …      …
  338    355  ENDDO
  339    356  ENDDO
         357  #if __acc_fft_device
  340    358  !$ACC END PARALLEL
         359  #endif
  341    360
  342    361  !
    …      …
  346    365  IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
  347    366  IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN
         367  #if __acc_fft_device
  348    368  !$ACC PARALLEL LOOP PRESENT(ar)
         369  #endif
  349    370  DO k = 1, nz
  350    371  ar(nxl_z,nys_z,k) = 0.0_wp
palm/trunk/TESTS/builds/pgi_mpi_openacc/build_config.yml
r3683  r3690
   10     10  - "MPI_2REAL=MPI_2DOUBLE_PRECISION"
   11     11  - "__parallel"
   12         - "__cuda_fft"
   13     12  options:
   14     13  default:
    …      …
   20     19  - "-ta=tesla"
   21     20  - "-Minfo=accel"
   22         - "-Mcuda"
   23     21  debug:
   24     22  - "-cpp"
    …      …
   31     29  - "-Mfree"
   32     30  - "-ta=tesla"
   33         - "-Mcuda"
   34     31  - "-Minfo=accel"
   35     32  includes: []
    …      …
   38     35  options:
   39     36  - "-ta=tesla"
   40         - "-Mcuda"
   41         - "-Mcudalib=cufft"
palm/trunk/TESTS/cases/openacc_test/case_config.yml
r3683  r3690
    5      5  - pgi_mpi_only
    6      6  - pgi_mpi_openacc
           7  - pgi_mpi_openacc_cufft
    7      8
    8      9  allowed_number_of_cores:
Note: See TracChangeset for help on using the changeset viewer.