Changeset 1111 for palm/trunk/SOURCE/transpose.f90
- Timestamp:
- Mar 8, 2013 11:54:10 PM (11 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/transpose.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! openACC directives added, 23 ! resorting data from/to work changed, work got 4 dimensions instead of 1 23 24 ! 24 25 ! Former revisions: … … 81 82 IMPLICIT NONE 82 83 83 INTEGER :: i, j, k, l, m,ys84 INTEGER :: i, j, k, l, ys 84 85 85 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 86 f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), & 87 f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 88 work(nnx*nny*nnz) 86 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) 87 88 REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work 89 90 !$acc declare create( f_inv ) 91 REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) 92 89 93 90 94 ! … … 93 97 !$OMP PARALLEL PRIVATE ( i, j, k ) 94 98 !$OMP DO 99 !$acc kernels present( f_in ) 100 !$acc loop 95 101 DO i = 0, nx 96 102 DO k = nzb_x, nzt_x 103 !$acc loop vector( 32 ) 97 104 DO j = nys_x, nyn_x 98 105 f_inv(j,k,i) = f_in(i,j,k) … … 100 107 ENDDO 101 108 ENDDO 109 !$acc end kernels 102 110 !$OMP END PARALLEL 103 111 … … 109 117 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 110 118 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 111 CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 112 work(1), sendrecvcount_xy, MPI_REAL, & 119 !$acc update host( f_inv ) 120 CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 121 work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & 113 122 comm1dy, ierr ) 123 !$acc update device( work ) 114 124 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 115 125 116 126 ! 117 127 !-- Reorder transposed array 118 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,ys )128 !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) 119 129 !$OMP DO 120 130 DO l = 0, pdims(2) - 1 121 m = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &122 ( nyn_x - nys_x + 1 )123 131 ys = 0 + l * ( nyn_x - nys_x + 1 ) 132 !$acc kernels present( f_out, work ) 133 !$acc loop 124 134 DO i = nxl_y, nxr_y 125 135 DO k = nzb_y, nzt_y 136 !$acc loop vector( 32 ) 126 137 DO j = ys, ys + nyn_x - nys_x 127 m = m + 1 128 f_out(j,i,k) = work(m) 138 f_out(j,i,k) = work(j-ys+1,k,i,l) 129 139 ENDDO 130 140 ENDDO 131 141 ENDDO 142 !$acc end kernels 132 143 ENDDO 133 144 !$OMP END PARALLEL … … 140 151 !$OMP PARALLEL PRIVATE ( i, j, k ) 141 152 !$OMP DO 153 !$acc kernels present( f_out ) 154 !$acc loop 142 155 DO k = nzb_y, nzt_y 143 156 DO i = nxl_y, nxr_y 157 !$acc loop vector( 32 ) 144 158 DO j = 0, ny 145 159 f_out(j,i,k) = f_inv(j,k,i) … … 147 161 ENDDO 148 162 ENDDO 163 !$acc end kernels 149 164 !$OMP END PARALLEL 150 165 … … 172 187 IMPLICIT NONE 173 188 174 INTEGER :: i, j, k, l, m,xs189 INTEGER :: i, j, k, l, xs 175 190 176 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 177 f_inv(nys:nyn,nxl:nxr,1:nz), & 178 f_out(1:nz,nys:nyn,nxl:nxr), & 179 work(nnx*nny*nnz) 191 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr) 192 193 REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work 194 195 !$acc declare create( f_inv ) 196 REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) 180 197 181 198 … … 188 205 ! 189 206 !-- Reorder input array for transposition 190 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,xs )207 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 191 208 !$OMP DO 192 209 DO l = 0, pdims(1) - 1 193 m = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )194 210 xs = 0 + l * nnx 211 !$acc kernels present( f_in, work ) 212 !$acc loop 195 213 DO k = nzb_x, nzt_x 196 214 DO i = xs, xs + nnx - 1 215 !$acc loop vector( 32 ) 197 216 DO j = nys_x, nyn_x 198 m = m + 1 199 work(m) = f_in(i,j,k) 217 work(j,i-xs+1,k,l) = f_in(i,j,k) 200 218 ENDDO 201 219 ENDDO 202 220 ENDDO 221 !$acc end kernels 203 222 ENDDO 204 223 !$OMP END PARALLEL … … 208 227 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 209 228 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 210 CALL MPI_ALLTOALL( work(1), sendrecvcount_zx, MPI_REAL, & 211 f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 229 !$acc update host( work ) 230 CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & 231 f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 212 232 comm1dx, ierr ) 233 !$acc update device( f_inv ) 213 234 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 214 235 … … 217 238 !$OMP PARALLEL PRIVATE ( i, j, k ) 218 239 !$OMP DO 240 !$acc kernels present( f_out ) 241 !$acc loop 219 242 DO k = 1, nz 220 243 DO i = nxl, nxr 244 !$acc loop vector( 32 ) 221 245 DO j = nys, nyn 222 246 f_out(k,j,i) = f_inv(j,i,k) … … 224 248 ENDDO 225 249 ENDDO 250 !$acc end kernels 226 251 !$OMP END PARALLEL 227 252 #endif … … 233 258 !$OMP PARALLEL PRIVATE ( i, j, k ) 234 259 !$OMP DO 260 !$acc kernels present( f_in ) 261 !$acc loop 235 262 DO i = nxl, nxr 236 263 DO j = nys, nyn 264 !$acc loop vector( 32 ) 237 265 DO k = 1, nz 238 266 f_inv(j,i,k) = f_in(i,j,k) … … 240 268 ENDDO 241 269 ENDDO 242 !$OMP END PARALLEL 243 244 !$OMP PARALLEL PRIVATE ( i, j, k ) 245 !$OMP DO 270 !$acc end kernels 271 !$OMP END PARALLEL 272 273 !$OMP PARALLEL PRIVATE ( i, j, k ) 274 !$OMP DO 275 !$acc kernels present( f_out ) 276 !$acc loop 246 277 DO k = 1, nz 247 278 DO i = nxl, nxr 279 !$acc loop vector( 32 ) 248 280 DO j = nys, nyn 249 281 f_out(k,j,i) = f_inv(j,i,k) … … 251 283 ENDDO 252 284 ENDDO 285 !$acc end kernels 253 286 !$OMP END PARALLEL 254 287 … … 276 309 IMPLICIT NONE 277 310 278 INTEGER :: i, j, k, l, m,ys311 INTEGER :: i, j, k, l, ys 279 312 280 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 281 f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), & 282 f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 283 work(nnx*nny*nnz) 313 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) 314 315 REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work 316 317 !$acc declare create( f_inv ) 318 REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) 319 284 320 285 321 IF ( numprocs /= 1 ) THEN … … 288 324 ! 289 325 !-- Reorder input array for transposition 290 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,ys )326 !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) 291 327 !$OMP DO 292 328 DO l = 0, pdims(2) - 1 293 m = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &294 ( nyn_x - nys_x + 1 )295 329 ys = 0 + l * ( nyn_x - nys_x + 1 ) 330 !$acc kernels present( f_in, work ) 331 !$acc loop 296 332 DO i = nxl_y, nxr_y 297 333 DO k = nzb_y, nzt_y 334 !$acc loop vector( 32 ) 298 335 DO j = ys, ys + nyn_x - nys_x 299 m = m + 1 300 work(m) = f_in(j,i,k) 336 work(j-ys+1,k,i,l) = f_in(j,i,k) 301 337 ENDDO 302 338 ENDDO 303 339 ENDDO 340 !$acc end kernels 304 341 ENDDO 305 342 !$OMP END PARALLEL … … 309 346 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 310 347 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 311 CALL MPI_ALLTOALL( work(1), sendrecvcount_xy, MPI_REAL, & 312 f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 348 !$acc update host( work ) 349 CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & 350 f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 313 351 comm1dy, ierr ) 352 !$acc update device( f_inv ) 314 353 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 315 354 #endif … … 321 360 !$OMP PARALLEL PRIVATE ( i, j, k ) 322 361 !$OMP DO 362 !$acc kernels present( f_in ) 363 !$acc loop 323 364 DO i = nxl_y, nxr_y 324 365 DO k = nzb_y, nzt_y 366 !$acc loop vector( 32 ) 325 367 DO j = 0, ny 326 368 f_inv(j,k,i) = f_in(j,i,k) … … 328 370 ENDDO 329 371 ENDDO 372 !$acc end kernels 330 373 !$OMP END PARALLEL 331 374 … … 336 379 !$OMP PARALLEL PRIVATE ( i, j, k ) 337 380 !$OMP DO 381 !$acc kernels present( f_out ) 382 !$acc loop 338 383 DO i = 0, nx 339 384 DO k = nzb_x, nzt_x 385 !$acc loop vector( 32 ) 340 386 DO j = nys_x, nyn_x 341 387 f_out(i,j,k) = f_inv(j,k,i) … … 343 389 ENDDO 344 390 ENDDO 391 !$acc end kernels 345 392 !$OMP END PARALLEL 346 393 … … 434 481 IMPLICIT NONE 435 482 436 INTEGER :: i, j, k, l, m,zs483 INTEGER :: i, j, k, l, zs 437 484 438 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 439 f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), & 440 f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz), & 441 work(nnx*nny*nnz) 485 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) 486 487 REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work 488 489 !$acc declare create( f_inv ) 490 REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) 491 442 492 443 493 ! … … 446 496 !$OMP PARALLEL PRIVATE ( i, j, k ) 447 497 !$OMP DO 498 !$acc kernels present( f_in ) 499 !$acc loop 448 500 DO j = 0, ny 449 501 DO k = nzb_y, nzt_y 502 !$acc loop vector( 32 ) 450 503 DO i = nxl_y, nxr_y 451 504 f_inv(i,k,j) = f_in(j,i,k) … … 453 506 ENDDO 454 507 ENDDO 508 !$acc end kernels 455 509 !$OMP END PARALLEL 456 510 … … 464 518 !$OMP PARALLEL PRIVATE ( i, j, k ) 465 519 !$OMP DO 520 !$acc kernels present( f_out ) 521 !$acc loop 466 522 DO j = 0, ny 467 523 DO k = nzb_y, nzt_y 524 !$acc loop vector( 32 ) 468 525 DO i = nxl_y, nxr_y 469 526 f_out(i,j,k) = f_inv(i,k,j) … … 471 528 ENDDO 472 529 ENDDO 530 !$acc end kernels 473 531 !$OMP END PARALLEL 474 532 … … 480 538 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 481 539 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 482 CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 483 work(1), sendrecvcount_yz, MPI_REAL, & 540 !$acc update host( f_inv ) 541 CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 542 work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & 484 543 comm1dx, ierr ) 544 !$acc update device( work ) 485 545 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 486 546 487 547 ! 488 548 !-- Reorder transposed array 489 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,zs )549 !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) 490 550 !$OMP DO 491 551 DO l = 0, pdims(1) - 1 492 m = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &493 ( nxr_z - nxl_z + 1 )494 552 zs = 1 + l * ( nzt_y - nzb_y + 1 ) 553 !$acc kernels present( f_out, work ) 554 !$acc loop 495 555 DO j = nys_z, nyn_z 496 556 DO k = zs, zs + nzt_y - nzb_y 557 !$acc loop vector( 32 ) 497 558 DO i = nxl_z, nxr_z 498 m = m + 1 499 f_out(i,j,k) = work(m) 559 f_out(i,j,k) = work(i,k-zs+1,j,l) 500 560 ENDDO 501 561 ENDDO 502 562 ENDDO 563 !$acc end kernels 503 564 ENDDO 504 565 !$OMP END PARALLEL … … 528 589 IMPLICIT NONE 529 590 530 INTEGER :: i, j, k, l, m,xs591 INTEGER :: i, j, k, l, xs 531 592 532 REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 533 work(nnx*nny*nnz) 534 535 !$acc declare create ( f_inv ) 593 REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) 594 595 REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work 596 597 !$acc declare create( f_inv ) 536 598 REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) 537 599 … … 552 614 ENDDO 553 615 ENDDO 616 !$acc end kernels 554 617 !$OMP END PARALLEL 555 618 … … 573 636 ENDDO 574 637 ENDDO 638 !$acc end kernels 575 639 !$OMP END PARALLEL 576 640 … … 582 646 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 583 647 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 584 CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 585 work(1), sendrecvcount_zx, MPI_REAL, & 648 !$acc update host( f_inv ) 649 CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 650 work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & 586 651 comm1dx, ierr ) 652 !$acc update device( work ) 587 653 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 588 654 589 655 ! 590 656 !-- Reorder transposed array 591 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,xs )657 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 592 658 !$OMP DO 593 659 DO l = 0, pdims(1) - 1 594 m = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )595 660 xs = 0 + l * nnx 661 !$acc kernels present( f_out, work ) 662 !$acc loop 596 663 DO k = nzb_x, nzt_x 597 664 DO i = xs, xs + nnx - 1 665 !$acc loop vector( 32 ) 598 666 DO j = nys_x, nyn_x 599 m = m + 1 600 f_out(i,j,k) = work(m) 667 f_out(i,j,k) = work(j,i-xs+1,k,l) 601 668 ENDDO 602 669 ENDDO 603 670 ENDDO 671 !$acc end kernels 604 672 ENDDO 605 673 !$OMP END PARALLEL … … 629 697 IMPLICIT NONE 630 698 631 INTEGER :: i, j, k, l, m,zs699 INTEGER :: i, j, k, l, zs 632 700 633 REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), & 634 f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), & 635 f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 636 work(nnx*nny*nnz) 701 REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) 702 703 REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work 704 705 !$acc declare create( f_inv ) 706 REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) 707 637 708 638 709 ! … … 644 715 ! 645 716 !-- Reorder input array for transposition 646 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,zs )717 !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) 647 718 !$OMP DO 648 719 DO l = 0, pdims(1) - 1 649 m = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &650 ( nxr_z - nxl_z + 1 )651 720 zs = 1 + l * ( nzt_y - nzb_y + 1 ) 721 !$acc kernels present( f_in, work ) 722 !$acc loop 652 723 DO j = nys_z, nyn_z 653 724 DO k = zs, zs + nzt_y - nzb_y 725 !$acc loop vector( 32 ) 654 726 DO i = nxl_z, nxr_z 655 m = m + 1 656 work(m) = f_in(i,j,k) 727 work(i,k-zs+1,j,l) = f_in(i,j,k) 657 728 ENDDO 658 729 ENDDO 659 730 ENDDO 731 !$acc end kernels 660 732 ENDDO 661 733 !$OMP END PARALLEL … … 665 737 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 666 738 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 667 CALL MPI_ALLTOALL( work(1), sendrecvcount_yz, MPI_REAL, & 668 f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 739 !$acc update host( work ) 740 CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & 741 f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 669 742 comm1dx, ierr ) 743 !$acc update device( f_inv ) 670 744 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 671 745 #endif … … 676 750 !$OMP PARALLEL PRIVATE ( i, j, k ) 677 751 !$OMP DO 752 !$acc kernels present( f_in ) 753 !$acc loop 678 754 DO k = nzb_y, nzt_y 679 755 DO j = 0, ny 756 !$acc loop vector( 32 ) 680 757 DO i = nxl_y, nxr_y 681 758 f_inv(i,k,j) = f_in(i,j,k) … … 683 760 ENDDO 684 761 ENDDO 762 !$acc end kernels 685 763 !$OMP END PARALLEL 686 764 … … 691 769 !$OMP PARALLEL PRIVATE ( i, j, k ) 692 770 !$OMP DO 771 !$acc kernels present( f_out ) 772 !$acc loop 693 773 DO k = nzb_y, nzt_y 694 774 DO i = nxl_y, nxr_y 775 !$acc loop vector( 32 ) 695 776 DO j = 0, ny 696 777 f_out(j,i,k) = f_inv(i,k,j) … … 698 779 ENDDO 699 780 ENDDO 781 !$acc end kernels 700 782 !$OMP END PARALLEL 701 783
Note: See TracChangeset
for help on using the changeset viewer.