- Timestamp:
- Jan 9, 2020 8:12:43 AM (5 years ago)
- Location:
- palm/trunk/SOURCE
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/Makefile
r4347 r4366 25 25 # ----------------- 26 26 # $Id$ 27 # add dependency on fft for transpose 28 # 29 # 4347 2019-12-18 13:18:33Z suehring 27 30 # add dependency to basic_constants_and_equations_mod for dynamics_mod 28 31 # … … 1160 1163 transpose.o: \ 1161 1164 cpulog_mod.o \ 1165 fft_xy_mod.o \ 1162 1166 mod_kinds.o \ 1163 1167 modules.o -
palm/trunk/SOURCE/fft_xy_mod.f90
r4360 r4366 25 25 ! ----------------- 26 26 ! $Id$ 27 ! Vectorized Temperton-fft added 28 ! 29 ! 4360 2020-01-07 11:25:50Z suehring 27 30 ! Corrected "Former revisions" section 28 31 ! … … 41 44 !> Fast Fourier transformation along x and y for 1d domain decomposition along x. 42 45 !> Original version: Klaus Ketelsen (May 2002) 46 !> @todo openmp support for vectorized Temperton fft 43 47 !------------------------------------------------------------------------------! 44 48 MODULE fft_xy … … 46 50 47 51 USE control_parameters, & 48 ONLY: fft_method, message_string52 ONLY: fft_method, loop_optimization, message_string 49 53 50 54 USE cuda_fft_interfaces … … 52 56 USE indices, & 53 57 ONLY: nx, ny, nz 54 58 55 59 #if defined( __cuda_fft ) 56 60 USE ISO_C_BINDING … … 72 76 73 77 PRIVATE 74 PUBLIC fft_x, fft_x_1d, fft_y, fft_y_1d, fft_init, fft_x_m, fft_y_m 78 PUBLIC fft_x, fft_x_1d, fft_y, fft_y_1d, fft_init, fft_x_m, fft_y_m, f_vec, temperton_fft_vec 75 79 76 80 INTEGER(iwp), DIMENSION(:), ALLOCATABLE, SAVE :: ifax_x !< 77 81 INTEGER(iwp), DIMENSION(:), ALLOCATABLE, SAVE :: ifax_y !< 78 82 79 LOGICAL, SAVE :: init_fft = .FALSE. !< 83 LOGICAL, SAVE :: init_fft = .FALSE. !< 84 LOGICAL, SAVE :: temperton_fft_vec = .FALSE. !< 80 85 81 86 REAL(wp), SAVE :: dnx !< … … 86 91 REAL(wp), DIMENSION(:), ALLOCATABLE, SAVE :: trigs_x !< 87 92 REAL(wp), DIMENSION(:), ALLOCATABLE, SAVE :: trigs_y !< 93 94 REAL(wp), DIMENSION(:,:), ALLOCATABLE, SAVE :: f_vec 88 95 89 96 #if defined( __ibm ) … … 200 207 ENDIF 201 208 209 ! 210 !-- Switch to tell the Poisson-solver that the vectorized version of Temperton-fft is to be used. 211 IF ( fft_method == 'temperton-algorithm' .AND. loop_optimization == 'vector' ) THEN 212 temperton_fft_vec = .TRUE. 213 ENDIF 214 215 202 216 #if defined( _OPENACC ) && defined( __cuda_fft ) 203 217 fft_method = 'system-specific' … … 270 284 CALL set99( trigs_y, ifax_y, ny+1 ) 271 285 286 IF ( temperton_fft_vec ) THEN 287 ALLOCATE( f_vec((nyn_x-nys_x+1)*(nzt_x-nzb_x+1),0:nx+2) ) 288 ENDIF 289 290 291 272 292 ELSEIF ( fft_method == 'fftw' ) THEN 273 293 ! … … 312 332 !------------------------------------------------------------------------------! 313 333 314 SUBROUTINE fft_x( ar, direction, ar_2d )334 SUBROUTINE fft_x( ar, direction, ar_2d, ar_inv ) 315 335 316 336 … … 325 345 INTEGER(iwp) :: j !< 326 346 INTEGER(iwp) :: k !< 347 INTEGER(iwp) :: mm !< 327 348 328 349 LOGICAL :: forward_fft !< … … 331 352 REAL(wp), DIMENSION(nx+2) :: work1 !< 332 353 354 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_vec !< 355 REAL(wp), DIMENSION(0:nx,nys_x:nyn_x), OPTIONAL :: ar_2d !< 356 357 REAL(wp), DIMENSION(nys_x:nyn_x,nzb_x:nzt_x,0:nx), OPTIONAL :: ar_inv !< 358 REAL(wp), DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: ar !< 359 333 360 #if defined( __ibm ) 334 361 REAL(wp), DIMENSION(nau2) :: aux2 !< … … 337 364 REAL(wp), DIMENSION(6*(nx+1)) :: work2 !< 338 365 #elif defined( __cuda_fft ) 339 COMPLEX(dp), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) :: & 340 ar_tmp !< 366 COMPLEX(dp), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) :: ar_tmp !< 341 367 !$ACC DECLARE CREATE(ar_tmp) 342 368 #endif 343 344 REAL(wp), DIMENSION(0:nx,nys_x:nyn_x), OPTIONAL :: &345 ar_2d !<346 REAL(wp), DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: &347 ar !<348 369 349 370 ! … … 427 448 IF ( forward_fft ) THEN 428 449 429 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 430 !$OMP DO 431 DO k = nzb_x, nzt_x 432 DO j = nys_x, nyn_x 433 434 work(0:nx) = ar(0:nx,j,k) 435 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 ) 436 437 DO i = 0, (nx+1)/2 438 ar(i,j,k) = work(2*i) 439 ENDDO 440 DO i = 1, (nx+1)/2 - 1 441 ar(nx+1-i,j,k) = work(2*i+1) 442 ENDDO 443 444 ENDDO 445 ENDDO 446 !$OMP END PARALLEL 447 448 ELSE 449 450 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 451 !$OMP DO 452 DO k = nzb_x, nzt_x 453 DO j = nys_x, nyn_x 454 455 DO i = 0, (nx+1)/2 456 work(2*i) = ar(i,j,k) 457 ENDDO 458 DO i = 1, (nx+1)/2 - 1 459 work(2*i+1) = ar(nx+1-i,j,k) 460 ENDDO 461 work(1) = 0.0_wp 462 work(nx+2) = 0.0_wp 463 464 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 ) 465 ar(0:nx,j,k) = work(0:nx) 466 467 ENDDO 468 ENDDO 469 !$OMP END PARALLEL 450 IF ( .NOT. temperton_fft_vec ) THEN 451 452 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 453 !$OMP DO 454 DO k = nzb_x, nzt_x 455 DO j = nys_x, nyn_x 456 457 work(0:nx) = ar(0:nx,j,k) 458 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 ) 459 460 DO i = 0, (nx+1)/2 461 ar(i,j,k) = work(2*i) 462 ENDDO 463 DO i = 1, (nx+1)/2 - 1 464 ar(nx+1-i,j,k) = work(2*i+1) 465 ENDDO 466 467 ENDDO 468 ENDDO 469 !$OMP END PARALLEL 470 471 ELSE 472 473 ! 474 !-- Vector version of the Temperton-algorithm. Computes multiple 1-D FFT's. 475 ALLOCATE( work_vec( (nyn_x-nys_x+1)*(nzt_x-nzb_x+1),nx+2) ) 476 ! 477 !-- f_vec is already set in transpose_zx 478 CALL fft991cy_vec( f_vec, work_vec, trigs_x, ifax_x, nx+1, -1 ) 479 DEALLOCATE( work_vec ) 480 481 IF ( PRESENT( ar_inv ) ) THEN 482 483 DO k = nzb_x, nzt_x 484 DO j = nys_x, nyn_x 485 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 486 DO i = 0, (nx+1)/2 487 ar_inv(j,k,i) = f_vec(mm,2*i) 488 ENDDO 489 DO i = 1, (nx+1)/2-1 490 ar_inv(j,k,nx+1-i) = f_vec(mm,2*i+1) 491 ENDDO 492 ENDDO 493 ENDDO 494 495 ELSE 496 497 DO k = nzb_x, nzt_x 498 DO j = nys_x, nyn_x 499 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 500 DO i = 0, (nx+1)/2 501 ar(i,j,k) = f_vec(mm,2*i) 502 ENDDO 503 DO i = 1, (nx+1)/2-1 504 ar(nx+1-i,j,k) = f_vec(mm,2*i+1) 505 ENDDO 506 ENDDO 507 ENDDO 508 509 ENDIF 510 511 ENDIF 512 513 ELSE 514 515 ! 516 !-- Backward fft 517 IF ( .NOT. temperton_fft_vec ) THEN 518 519 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 520 !$OMP DO 521 DO k = nzb_x, nzt_x 522 DO j = nys_x, nyn_x 523 524 DO i = 0, (nx+1)/2 525 work(2*i) = ar(i,j,k) 526 ENDDO 527 DO i = 1, (nx+1)/2 - 1 528 work(2*i+1) = ar(nx+1-i,j,k) 529 ENDDO 530 work(1) = 0.0_wp 531 work(nx+2) = 0.0_wp 532 533 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 ) 534 ar(0:nx,j,k) = work(0:nx) 535 536 ENDDO 537 ENDDO 538 !$OMP END PARALLEL 539 540 ELSE 541 542 IF ( PRESENT( ar_inv ) ) THEN 543 544 DO k = nzb_x, nzt_x 545 DO j = nys_x, nyn_x 546 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 547 DO i = 0, (nx+1)/2 548 f_vec(mm,2*i) = ar_inv(j,k,i) 549 ENDDO 550 DO i = 1, (nx+1)/2-1 551 f_vec(mm,2*i+1) = ar_inv(j,k,nx+1-i) 552 ENDDO 553 ENDDO 554 ENDDO 555 556 ELSE 557 558 DO k = nzb_x, nzt_x 559 DO j = nys_x, nyn_x 560 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 561 DO i = 0, (nx+1)/2 562 f_vec(mm,2*i) = ar(i,j,k) 563 ENDDO 564 DO i = 1, (nx+1)/2-1 565 f_vec(mm,2*i+1) = ar(nx+1-i,j,k) 566 ENDDO 567 ENDDO 568 ENDDO 569 570 ENDIF 571 f_vec(:,1) = 0.0_wp 572 f_vec(:,nx+2) = 0.0_wp 573 574 ALLOCATE( work_vec((nyn_x-nys_x+1)*(nzt_x-nzb_x+1),nx+2) ) 575 CALL fft991cy_vec( f_vec, work_vec, trigs_x, ifax_x, nx+1, 1 ) 576 DEALLOCATE( work_vec ) 577 578 ENDIF 470 579 471 580 ENDIF … … 941 1050 942 1051 SUBROUTINE fft_y( ar, direction, ar_tr, nxl_y_bound, nxr_y_bound, nxl_y_l, & 943 nxr_y_l )1052 nxr_y_l, ar_inv ) 944 1053 945 1054 … … 952 1061 INTEGER(iwp) :: jshape(1) !< 953 1062 INTEGER(iwp) :: k !< 1063 INTEGER(iwp) :: mm !< 954 1064 INTEGER(iwp) :: nxl_y_bound !< 955 1065 INTEGER(iwp) :: nxl_y_l !< … … 962 1072 REAL(wp), DIMENSION(ny+2) :: work1 !< 963 1073 1074 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: f_vec 1075 REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_vec 1076 1077 REAL(wp), DIMENSION(0:ny,nxl_y_l:nxr_y_l,nzb_y:nzt_y) :: ar !< 1078 REAL(wp), DIMENSION(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), OPTIONAL :: ar_inv !< 1079 REAL(wp), DIMENSION(0:ny,nxl_y_bound:nxr_y_bound,nzb_y:nzt_y), OPTIONAL :: ar_tr !< 1080 964 1081 COMPLEX(wp), DIMENSION(:), ALLOCATABLE :: cwork !< 965 1082 … … 975 1092 #endif 976 1093 977 REAL(wp), DIMENSION(0:ny,nxl_y_l:nxr_y_l,nzb_y:nzt_y) :: &978 ar !<979 REAL(wp), DIMENSION(0:ny,nxl_y_bound:nxr_y_bound,nzb_y:nzt_y) :: &980 ar_tr !<981 1094 982 1095 IF ( direction == 'forward' ) THEN … … 1057 1170 IF ( forward_fft ) THEN 1058 1171 1059 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 1060 !$OMP DO 1061 DO k = nzb_y, nzt_y 1062 DO i = nxl_y_l, nxr_y_l 1063 1064 work(0:ny) = ar(0:ny,i,k) 1065 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 ) 1066 1067 DO j = 0, (ny+1)/2 1068 ar_tr(j,i,k) = work(2*j) 1069 ENDDO 1070 DO j = 1, (ny+1)/2 - 1 1071 ar_tr(ny+1-j,i,k) = work(2*j+1) 1072 ENDDO 1073 1074 ENDDO 1075 ENDDO 1076 !$OMP END PARALLEL 1077 1078 ELSE 1079 1080 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 1081 !$OMP DO 1082 DO k = nzb_y, nzt_y 1083 DO i = nxl_y_l, nxr_y_l 1084 1085 DO j = 0, (ny+1)/2 1086 work(2*j) = ar_tr(j,i,k) 1087 ENDDO 1088 DO j = 1, (ny+1)/2 - 1 1089 work(2*j+1) = ar_tr(ny+1-j,i,k) 1090 ENDDO 1091 work(1) = 0.0_wp 1092 work(ny+2) = 0.0_wp 1093 1094 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 ) 1095 ar(0:ny,i,k) = work(0:ny) 1096 1097 ENDDO 1098 ENDDO 1099 !$OMP END PARALLEL 1172 IF ( .NOT. temperton_fft_vec ) THEN 1173 1174 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 1175 !$OMP DO 1176 DO k = nzb_y, nzt_y 1177 DO i = nxl_y_l, nxr_y_l 1178 1179 work(0:ny) = ar(0:ny,i,k) 1180 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 ) 1181 1182 DO j = 0, (ny+1)/2 1183 ar_tr(j,i,k) = work(2*j) 1184 ENDDO 1185 DO j = 1, (ny+1)/2 - 1 1186 ar_tr(ny+1-j,i,k) = work(2*j+1) 1187 ENDDO 1188 1189 ENDDO 1190 ENDDO 1191 !$OMP END PARALLEL 1192 1193 ELSE 1194 ! 1195 !-- Vector version of Temperton-fft. Computes multiple 1-D FFT's. 1196 ALLOCATE( f_vec((nxr_y_l-nxl_y_l+1)*(nzt_y-nzb_y+1),0:ny+2) ) 1197 1198 mm = 1 1199 DO k = nzb_y, nzt_y 1200 DO i = nxl_y_l, nxr_y_l 1201 f_vec(mm,0:nx) = ar(0:nx,i,k) 1202 mm = mm+1 1203 ENDDO 1204 ENDDO 1205 1206 ALLOCATE( work_vec( (nxr_y_l-nxl_y_l+1)*(nzt_y-nzb_y+1),ny+2) ) 1207 CALL fft991cy_vec( f_vec, work_vec, trigs_y, ifax_y, ny+1, -1 ) 1208 DEALLOCATE( work_vec ) 1209 1210 IF( PRESENT( ar_inv ) ) THEN 1211 1212 DO k = nzb_y, nzt_y 1213 DO i = nxl_y_l, nxr_y_l 1214 mm = i-nxl_y_l+1+(k-nzb_y)*(nxr_y_l-nxl_y_l+1) 1215 DO j = 0, (ny+1)/2 1216 ar_inv(i,k,j) = f_vec(mm,2*j) 1217 ENDDO 1218 DO j = 1, (ny+1)/2 - 1 1219 ar_inv(i,k,ny+1-j) = f_vec(mm,2*j+1) 1220 ENDDO 1221 ENDDO 1222 ENDDO 1223 1224 ELSE 1225 1226 DO k = nzb_y, nzt_y 1227 DO i = nxl_y_l, nxr_y_l 1228 mm = i-nxl_y_l+1+(k-nzb_y)*(nxr_y_l-nxl_y_l+1) 1229 DO j = 0, (ny+1)/2 1230 ar(j,i,k) = f_vec(mm,2*j) 1231 ENDDO 1232 DO j = 1, (ny+1)/2 - 1 1233 ar(ny+1-j,i,k) = f_vec(mm,2*j+1) 1234 ENDDO 1235 ENDDO 1236 ENDDO 1237 1238 ENDIF 1239 1240 DEALLOCATE( f_vec ) 1241 1242 ENDIF 1243 1244 ELSE 1245 1246 IF ( .NOT. temperton_fft_vec ) THEN 1247 1248 !$OMP PARALLEL PRIVATE ( work, work1, i, j, k ) 1249 !$OMP DO 1250 DO k = nzb_y, nzt_y 1251 DO i = nxl_y_l, nxr_y_l 1252 1253 DO j = 0, (ny+1)/2 1254 work(2*j) = ar_tr(j,i,k) 1255 ENDDO 1256 DO j = 1, (ny+1)/2 - 1 1257 work(2*j+1) = ar_tr(ny+1-j,i,k) 1258 ENDDO 1259 work(1) = 0.0_wp 1260 work(ny+2) = 0.0_wp 1261 1262 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 ) 1263 ar(0:ny,i,k) = work(0:ny) 1264 1265 ENDDO 1266 ENDDO 1267 !$OMP END PARALLEL 1268 1269 ELSE 1270 1271 ALLOCATE( f_vec((nxr_y_l-nxl_y_l+1)*(nzt_y-nzb_y+1),0:ny+2) ) 1272 1273 IF ( PRESENT( ar_inv ) ) THEN 1274 1275 DO k = nzb_y, nzt_y 1276 DO i = nxl_y_l, nxr_y_l 1277 mm = i-nxl_y_l+1+(k-nzb_y)*(nxr_y_l-nxl_y_l+1) 1278 DO j = 0, (ny+1)/2 1279 f_vec(mm,2*j) = ar_inv(i,k,j) 1280 ENDDO 1281 DO j = 1, (ny+1)/2 - 1 1282 f_vec(mm,2*j+1) = ar_inv(i,k,ny+1-j) 1283 ENDDO 1284 ENDDO 1285 ENDDO 1286 1287 ELSE 1288 1289 DO k = nzb_y, nzt_y 1290 DO i = nxl_y_l, nxr_y_l 1291 mm = i-nxl_y_l+1+(k-nzb_y)*(nxr_y_l-nxl_y_l+1) 1292 DO j = 0, (ny+1)/2 1293 f_vec(mm,2*j) = ar(j,i,k) 1294 ENDDO 1295 DO j = 1, (ny+1)/2 - 1 1296 f_vec(mm,2*j+1) = ar(ny+1-j,i,k) 1297 ENDDO 1298 ENDDO 1299 ENDDO 1300 1301 ENDIF 1302 1303 f_vec(:,1) = 0.0_wp 1304 f_vec(:,ny+2) = 0.0_wp 1305 1306 ALLOCATE( work_vec((nxr_y_l-nxl_y_l+1)*(nzt_y-nzb_y+1),ny+2) ) 1307 CALL fft991cy_vec( f_vec, work_vec, trigs_y, ifax_y, ny+1, 1 ) 1308 DEALLOCATE( work_vec ) 1309 1310 mm = 1 1311 DO k = nzb_y, nzt_y 1312 DO i = nxl_y_l, nxr_y_l 1313 ar(0:ny,i,k) = f_vec(mm,0:ny) 1314 mm = mm+1 1315 ENDDO 1316 ENDDO 1317 1318 DEALLOCATE( f_vec ) 1319 1320 ENDIF 1100 1321 1101 1322 ENDIF -
palm/trunk/SOURCE/poisfft_mod.f90
r4360 r4366 25 25 ! ----------------- 26 26 ! $Id$ 27 ! modification concerning NEC vectorizatio 28 ! 29 ! 4360 2020-01-07 11:25:50Z suehring 27 30 ! Corrected "Former revisions" section 28 31 ! … … 50 53 51 54 USE fft_xy, & 52 ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m 55 ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, & 56 temperton_fft_vec 53 57 54 58 USE indices, & … … 207 211 208 212 CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) 209 CALL fft_x( ar, 'forward' ) 213 IF ( temperton_fft_vec ) THEN 214 ! 215 !-- Vector version outputs a transformed array ar_inv that does not require resorting 216 !-- (which is done for ar further below) 217 CALL fft_x( ar, 'forward', ar_inv=ar_inv) 218 ELSE 219 CALL fft_x( ar, 'forward') 220 ENDIF 210 221 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) 211 222 … … 213 224 !-- Transposition x --> y 214 225 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 215 CALL resort_for_xy( ar, ar_inv )226 IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv ) 216 227 CALL transpose_xy( ar_inv, ar ) 217 228 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) 218 229 219 230 CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) 220 CALL fft_y( ar, 'forward', ar_tr = ar, & 221 nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 222 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 231 IF ( temperton_fft_vec ) THEN 232 ! 233 !-- Input array ar_inv from fft_x can be directly used here. 234 !-- The output (also in array ar_inv) does not require resorting below. 235 CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 236 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 237 ELSE 238 CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 239 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 240 ENDIF 223 241 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) 224 242 … … 226 244 !-- Transposition y --> z 227 245 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) 228 CALL resort_for_yz( ar, ar_inv )246 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv ) 229 247 CALL transpose_yz( ar_inv, ar ) 230 248 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) … … 241 259 CALL cpu_log( log_point_s(8), 'transpo invers', 'start' ) 242 260 CALL transpose_zy( ar, ar_inv ) 243 CALL resort_for_zy( ar_inv, ar ) 261 ! 262 !-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a 263 !-- resorting) 264 IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar ) 244 265 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 245 266 246 267 CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) 247 CALL fft_y( ar, 'backward', ar_tr = ar, & 248 nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 249 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 268 IF ( temperton_fft_vec ) THEN 269 ! 270 !-- Output array ar_inv can be used as input to the below fft_x routine without resorting 271 CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,& 272 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 273 ELSE 274 CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & 275 nxl_y_l = nxl_y, nxr_y_l = nxr_y ) 276 ENDIF 277 250 278 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) 251 279 … … 254 282 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) 255 283 CALL transpose_yx( ar, ar_inv ) 256 CALL resort_for_yx( ar_inv, ar )284 IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar ) 257 285 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) 258 286 259 287 CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) 260 CALL fft_x( ar, 'backward' ) 288 IF ( temperton_fft_vec ) THEN 289 CALL fft_x( ar, 'backward', ar_inv=ar_inv ) 290 ELSE 291 CALL fft_x( ar, 'backward' ) 292 ENDIF 261 293 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) 262 294 -
palm/trunk/SOURCE/surface_layer_fluxes_mod.f90
r4360 r4366 26 26 ! ----------------- 27 27 ! $Id$ 28 ! vector version for calculation of Obukhov length via Newton iteration added 29 ! 30 ! 4360 2020-01-07 11:25:50Z suehring 28 31 ! Calculation of diagnostic-only 2-m potential temperature moved to 29 32 ! diagnostic_output_quantities. … … 112 115 constant_waterflux, coupling_mode, & 113 116 debug_output_timestep, & 114 humidity, 117 humidity, loop_optimization, & 115 118 ibc_e_b, ibc_pt_b, indoor_model, & 116 119 land_surface, large_scale_forcing, lsf_surf, message_string, & … … 859 862 INTEGER(iwp) :: m !< loop variable over all horizontal wall elements 860 863 864 LOGICAL, DIMENSION(surf%ns) :: convergence_reached !< convergence switch for vectorization 865 861 866 REAL(wp) :: f, & !< Function for Newton iteration: f = Ri - [...]/[...]^2 = 0 862 867 f_d_ol, & !< Derivative of f … … 866 871 ol_u !< Upper bound of L for Newton iteration 867 872 873 REAL(wp), DIMENSION(surf%ns) :: ol_old_vec !< temporary array required for vectorization 874 REAL(wp), DIMENSION(surf%ns) :: z_mo_vec !< temporary array required for vectorization 875 868 876 ! 869 877 !-- Evaluate bulk Richardson number (calculation depends on … … 926 934 ENDIF 927 935 928 ! 929 !-- Calculate the Obukhov length using Newton iteration 930 !$OMP PARALLEL DO PRIVATE(i, j, z_mo) & 931 !$OMP PRIVATE(ol_old, ol_m, ol_l, ol_u, f, f_d_ol) 932 !$ACC PARALLEL LOOP PRIVATE(i, j, z_mo) & 933 !$ACC PRIVATE(ol_old, ol_m, ol_l, ol_u, f, f_d_ol) & 934 !$ACC PRESENT(surf) 935 DO m = 1, surf%ns 936 937 i = surf%i(m) 938 j = surf%j(m) 939 940 z_mo = surf%z_mo(m) 941 942 ! 943 !-- Store current value in case the Newton iteration fails 944 ol_old = surf%ol(m) 945 946 ! 947 !-- Ensure that the bulk Richardson number and the Obukhov 948 !-- length have the same sign 949 IF ( surf%rib(m) * surf%ol(m) < 0.0_wp .OR. & 950 ABS( surf%ol(m) ) == ol_max ) THEN 951 IF ( surf%rib(m) > 1.0_wp ) surf%ol(m) = 0.01_wp 952 IF ( surf%rib(m) < 0.0_wp ) surf%ol(m) = -0.01_wp 953 ENDIF 936 IF ( loop_optimization == 'cache' ) THEN 937 ! 938 !-- Calculate the Obukhov length using Newton iteration 939 !$OMP PARALLEL DO PRIVATE(i, j, z_mo) & 940 !$OMP PRIVATE(ol_old, ol_m, ol_l, ol_u, f, f_d_ol) 941 !$ACC PARALLEL LOOP PRIVATE(i, j, z_mo) & 942 !$ACC PRIVATE(ol_old, ol_m, ol_l, ol_u, f, f_d_ol) & 943 !$ACC PRESENT(surf) 944 DO m = 1, surf%ns 945 946 i = surf%i(m) 947 j = surf%j(m) 948 949 z_mo = surf%z_mo(m) 950 951 ! 952 !-- Store current value in case the Newton iteration fails 953 ol_old = surf%ol(m) 954 955 ! 956 !-- Ensure that the bulk Richardson number and the Obukhov 957 !-- length have the same sign 958 IF ( surf%rib(m) * surf%ol(m) < 0.0_wp .OR. ABS( surf%ol(m) ) == ol_max ) THEN 959 IF ( surf%rib(m) > 1.0_wp ) surf%ol(m) = 0.01_wp 960 IF ( surf%rib(m) < 0.0_wp ) surf%ol(m) = -0.01_wp 961 ENDIF 962 ! 963 !-- Iteration to find Obukhov length 964 iter = 0 965 DO 966 iter = iter + 1 967 ! 968 !-- In case of divergence, use the value of the previous time step 969 IF ( iter > 1000 ) THEN 970 surf%ol(m) = ol_old 971 EXIT 972 ENDIF 973 974 ol_m = surf%ol(m) 975 ol_l = ol_m - 0.001_wp * ol_m 976 ol_u = ol_m + 0.001_wp * ol_m 977 978 979 IF ( ibc_pt_b /= 1 ) THEN 980 ! 981 !-- Calculate f = Ri - [...]/[...]^2 = 0 982 f = surf%rib(m) - ( z_mo / ol_m ) * ( LOG( z_mo / surf%z0h(m) ) & 983 - psi_h( z_mo / ol_m ) & 984 + psi_h( surf%z0h(m) / ol_m ) & 985 ) / & 986 ( LOG( z_mo / surf%z0(m) ) & 987 - psi_m( z_mo / ol_m ) & 988 + psi_m( surf%z0(m) / ol_m ) & 989 )**2 990 991 ! 992 !-- Calculate df/dL 993 f_d_ol = ( - ( z_mo / ol_u ) * ( LOG( z_mo / surf%z0h(m) ) & 994 - psi_h( z_mo / ol_u ) & 995 + psi_h( surf%z0h(m) / ol_u ) & 996 ) / & 997 ( LOG( z_mo / surf%z0(m) ) & 998 - psi_m( z_mo / ol_u ) & 999 + psi_m( surf%z0(m) / ol_u ) & 1000 )**2 & 1001 + ( z_mo / ol_l ) * ( LOG( z_mo / surf%z0h(m) ) & 1002 - psi_h( z_mo / ol_l ) & 1003 + psi_h( surf%z0h(m) / ol_l ) & 1004 ) / & 1005 ( LOG( z_mo / surf%z0(m) ) & 1006 - psi_m( z_mo / ol_l ) & 1007 + psi_m( surf%z0(m) / ol_l ) & 1008 )**2 & 1009 ) / ( ol_u - ol_l ) 1010 ELSE 1011 ! 1012 !-- Calculate f = Ri - 1 /[...]^3 = 0 1013 f = surf%rib(m) - ( z_mo / ol_m ) / ( LOG( z_mo / surf%z0(m) ) & 1014 - psi_m( z_mo / ol_m ) & 1015 + psi_m( surf%z0(m) / ol_m ) & 1016 )**3 1017 1018 ! 1019 !-- Calculate df/dL 1020 f_d_ol = ( - ( z_mo / ol_u ) / ( LOG( z_mo / surf%z0(m) ) & 1021 - psi_m( z_mo / ol_u ) & 1022 + psi_m( surf%z0(m) / ol_u ) & 1023 )**3 & 1024 + ( z_mo / ol_l ) / ( LOG( z_mo / surf%z0(m) ) & 1025 - psi_m( z_mo / ol_l ) & 1026 + psi_m( surf%z0(m) / ol_l ) & 1027 )**3 & 1028 ) / ( ol_u - ol_l ) 1029 ENDIF 1030 ! 1031 !-- Calculate new L 1032 surf%ol(m) = ol_m - f / f_d_ol 1033 1034 ! 1035 !-- Ensure that the bulk Richardson number and the Obukhov 1036 !-- length have the same sign and ensure convergence. 1037 IF ( surf%ol(m) * ol_m < 0.0_wp ) surf%ol(m) = ol_m * 0.5_wp 1038 ! 1039 !-- If unrealistic value occurs, set L to the maximum 1040 !-- value that is allowed 1041 IF ( ABS( surf%ol(m) ) > ol_max ) THEN 1042 surf%ol(m) = ol_max 1043 EXIT 1044 ENDIF 1045 ! 1046 !-- Assure that Obukhov length does not become zero. If the limit is 1047 !-- reached, exit the loop. 1048 IF ( ABS( surf%ol(m) ) < 1E-5_wp ) THEN 1049 surf%ol(m) = SIGN( 1E-5_wp, surf%ol(m) ) 1050 EXIT 1051 ENDIF 1052 ! 1053 !-- Check for convergence 1054 IF ( ABS( ( surf%ol(m) - ol_m ) / surf%ol(m) ) < 1.0E-4_wp ) EXIT 1055 1056 ENDDO 1057 ENDDO 1058 1059 ! 1060 !-- Vector Version 1061 ELSE 1062 ! 1063 !-- Calculate the Obukhov length using Newton iteration 1064 !-- First set arrays required for vectorization 1065 DO m = 1, surf%ns 1066 1067 z_mo_vec(m) = surf%z_mo(m) 1068 1069 ! 1070 !-- Store current value in case the Newton iteration fails 1071 ol_old_vec(m) = surf%ol(m) 1072 1073 ! 1074 !-- Ensure that the bulk Richardson number and the Obukhov length have the same sign 1075 IF ( surf%rib(m) * surf%ol(m) < 0.0_wp .OR. ABS( surf%ol(m) ) == ol_max ) THEN 1076 IF ( surf%rib(m) > 1.0_wp ) surf%ol(m) = 0.01_wp 1077 IF ( surf%rib(m) < 0.0_wp ) surf%ol(m) = -0.01_wp 1078 ENDIF 1079 1080 ENDDO 1081 954 1082 ! 955 1083 !-- Iteration to find Obukhov length 1084 convergence_reached(:) = .FALSE. 956 1085 iter = 0 957 1086 DO 1087 958 1088 iter = iter + 1 959 1089 ! 960 !-- In case of divergence, use the value of the previous time step1090 !-- In case of divergence, use the value(s) of the previous time step 961 1091 IF ( iter > 1000 ) THEN 962 surf%ol(m) = ol_old 1092 DO m = 1, surf%ns 1093 IF ( .NOT. convergence_reached(m) ) surf%ol(1:surf%ns) = ol_old 1094 ENDDO 963 1095 EXIT 964 1096 ENDIF 965 1097 966 ol_m = surf%ol(m) 967 ol_l = ol_m - 0.001_wp * ol_m 968 ol_u = ol_m + 0.001_wp * ol_m 969 970 971 IF ( ibc_pt_b /= 1 ) THEN 972 ! 973 !-- Calculate f = Ri - [...]/[...]^2 = 0 974 f = surf%rib(m) - ( z_mo / ol_m ) * ( & 975 LOG( z_mo / surf%z0h(m) ) & 976 - psi_h( z_mo / ol_m ) & 977 + psi_h( surf%z0h(m) / & 978 ol_m ) & 979 ) & 980 / ( LOG( z_mo / surf%z0(m) ) & 981 - psi_m( z_mo / ol_m ) & 982 + psi_m( surf%z0(m) / ol_m ) & 983 )**2 984 985 ! 986 !-- Calculate df/dL 987 f_d_ol = ( - ( z_mo / ol_u ) * ( LOG( z_mo / & 988 surf%z0h(m) ) & 989 - psi_h( z_mo / ol_u ) & 990 + psi_h( surf%z0h(m) / ol_u ) & 991 ) & 992 / ( LOG( z_mo / surf%z0(m) ) & 993 - psi_m( z_mo / ol_u ) & 994 + psi_m( surf%z0(m) / ol_u ) & 995 )**2 & 996 + ( z_mo / ol_l ) * ( LOG( z_mo / surf%z0h(m) ) & 997 - psi_h( z_mo / ol_l ) & 998 + psi_h( surf%z0h(m) / ol_l ) & 999 ) & 1000 / ( LOG( z_mo / surf%z0(m) ) & 1001 - psi_m( z_mo / ol_l ) & 1002 + psi_m( surf%z0(m) / ol_l ) & 1003 )**2 & 1004 ) / ( ol_u - ol_l ) 1005 ELSE 1006 ! 1007 !-- Calculate f = Ri - 1 /[...]^3 = 0 1008 f = surf%rib(m) - ( z_mo / ol_m ) / & 1009 ( LOG( z_mo / surf%z0(m) ) & 1010 - psi_m( z_mo / ol_m ) & 1011 + psi_m( surf%z0(m) / ol_m ) & 1012 )**3 1013 1014 ! 1015 !-- Calculate df/dL 1016 f_d_ol = ( - ( z_mo / ol_u ) / ( LOG( z_mo / surf%z0(m) ) & 1017 - psi_m( z_mo / ol_u ) & 1018 + psi_m( surf%z0(m) / ol_u ) & 1019 )**3 & 1020 + ( z_mo / ol_l ) / ( LOG( z_mo / surf%z0(m) ) & 1021 - psi_m( z_mo / ol_l ) & 1022 + psi_m( surf%z0(m) / ol_l ) & 1023 )**3 & 1024 ) / ( ol_u - ol_l ) 1025 ENDIF 1026 ! 1027 !-- Calculate new L 1028 surf%ol(m) = ol_m - f / f_d_ol 1029 1030 ! 1031 !-- Ensure that the bulk Richardson number and the Obukhov 1032 !-- length have the same sign and ensure convergence. 1033 IF ( surf%ol(m) * ol_m < 0.0_wp ) surf%ol(m) = ol_m * 0.5_wp 1034 ! 1035 !-- If unrealistic value occurs, set L to the maximum 1036 !-- value that is allowed 1037 IF ( ABS( surf%ol(m) ) > ol_max ) THEN 1038 surf%ol(m) = ol_max 1039 EXIT 1040 ENDIF 1041 ! 1042 !-- Assure that Obukhov length does not become zero. If the limit is 1043 !-- reached, exit the loop. 1044 IF ( ABS( surf%ol(m) ) < 1E-5_wp ) THEN 1045 surf%ol(m) = SIGN( 1E-5_wp, surf%ol(m) ) 1046 EXIT 1047 ENDIF 1048 ! 1049 !-- Check for convergence 1050 IF ( ABS( ( surf%ol(m) - ol_m ) / surf%ol(m) ) < 1.0E-4_wp ) THEN 1051 EXIT 1052 ELSE 1053 CYCLE 1054 ENDIF 1055 1056 ENDDO 1057 ENDDO 1098 1099 DO m = 1, surf%ns 1100 1101 IF ( convergence_reached(m) ) CYCLE 1102 1103 ol_m = surf%ol(m) 1104 ol_l = ol_m - 0.001_wp * ol_m 1105 ol_u = ol_m + 0.001_wp * ol_m 1106 1107 1108 IF ( ibc_pt_b /= 1 ) THEN 1109 ! 1110 !-- Calculate f = Ri - [...]/[...]^2 = 0 1111 f = surf%rib(m) - ( z_mo_vec(m) / ol_m ) * ( LOG( z_mo_vec(m) / surf%z0h(m) ) & 1112 - psi_h( z_mo_vec(m) / ol_m ) & 1113 + psi_h( surf%z0h(m) / ol_m ) & 1114 ) / & 1115 ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1116 - psi_m( z_mo_vec(m) / ol_m ) & 1117 + psi_m( surf%z0(m) / ol_m ) & 1118 )**2 1119 1120 ! 1121 !-- Calculate df/dL 1122 f_d_ol = ( - ( z_mo_vec(m) / ol_u ) * ( LOG( z_mo_vec(m) / surf%z0h(m) ) & 1123 - psi_h( z_mo_vec(m) / ol_u ) & 1124 + psi_h( surf%z0h(m) / ol_u ) & 1125 ) / & 1126 ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1127 - psi_m( z_mo_vec(m) / ol_u ) & 1128 + psi_m( surf%z0(m) / ol_u ) & 1129 )**2 & 1130 + ( z_mo_vec(m) / ol_l ) * ( LOG( z_mo_vec(m) / surf%z0h(m) ) & 1131 - psi_h( z_mo_vec(m) / ol_l ) & 1132 + psi_h( surf%z0h(m) / ol_l ) & 1133 ) / & 1134 ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1135 - psi_m( z_mo_vec(m) / ol_l ) & 1136 + psi_m( surf%z0(m) / ol_l ) & 1137 )**2 & 1138 ) / ( ol_u - ol_l ) 1139 ELSE 1140 ! 1141 !-- Calculate f = Ri - 1 /[...]^3 = 0 1142 f = surf%rib(m) - ( z_mo_vec(m) / ol_m ) / ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1143 - psi_m( z_mo_vec(m) / ol_m ) & 1144 + psi_m( surf%z0(m) / ol_m ) & 1145 )**3 1146 1147 ! 1148 !-- Calculate df/dL 1149 f_d_ol = ( - ( z_mo_vec(m) / ol_u ) / ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1150 - psi_m( z_mo_vec(m) / ol_u ) & 1151 + psi_m( surf%z0(m) / ol_u ) & 1152 )**3 & 1153 + ( z_mo_vec(m) / ol_l ) / ( LOG( z_mo_vec(m) / surf%z0(m) ) & 1154 - psi_m( z_mo_vec(m) / ol_l ) & 1155 + psi_m( surf%z0(m) / ol_l ) & 1156 )**3 & 1157 ) / ( ol_u - ol_l ) 1158 ENDIF 1159 ! 1160 !-- Calculate new L 1161 surf%ol(m) = ol_m - f / f_d_ol 1162 1163 ! 1164 !-- Ensure that the bulk Richardson number and the Obukhov 1165 !-- length have the same sign and ensure convergence. 1166 IF ( surf%ol(m) * ol_m < 0.0_wp ) surf%ol(m) = ol_m * 0.5_wp 1167 1168 ! 1169 !-- Check for convergence 1170 !-- This check does not modify surf%ol, therefore this is done first 1171 IF ( ABS( ( surf%ol(m) - ol_m ) / surf%ol(m) ) < 1.0E-4_wp ) THEN 1172 convergence_reached(m) = .TRUE. 1173 ENDIF 1174 ! 1175 !-- If unrealistic value occurs, set L to the maximum allowed value 1176 IF ( ABS( surf%ol(m) ) > ol_max ) THEN 1177 surf%ol(m) = ol_max 1178 convergence_reached(m) = .TRUE. 1179 ENDIF 1180 1181 ENDDO 1182 ! 1183 !-- Assure that Obukhov length does not become zero 1184 DO m = 1, surf%ns 1185 IF ( convergence_reached(m) ) CYCLE 1186 IF ( ABS( surf%ol(m) ) < 1E-5_wp ) THEN 1187 surf%ol(m) = SIGN( 10E-6_wp, surf%ol(m) ) 1188 convergence_reached(m) = .TRUE. 1189 ENDIF 1190 ENDDO 1191 1192 IF ( ALL( convergence_reached ) ) EXIT 1193 1194 ENDDO ! end of iteration loop 1195 1196 ENDIF ! end of vector branch 1058 1197 1059 1198 END SUBROUTINE calc_ol -
palm/trunk/SOURCE/surface_mod.f90
r4360 r4366 26 26 ! ----------------- 27 27 ! $Id$ 28 ! workaround implemented to avoid vectorization bug on NEC Aurora 29 ! 30 ! 4360 2020-01-07 11:25:50Z suehring 28 31 ! Fix also remaining message calls. 29 32 ! … … 2375 2378 num_def_h_kji(0) - 1 2376 2379 start_index_def_h(0) = surf_def_h(0)%end_index(j,i) + 1 2380 ! 2381 !-- ATTENTION: 2382 !-- workaround to prevent vectorization bug on NEC Aurora 2383 IF ( start_index_def_h(0) < -99999 ) THEN 2384 PRINT*, 'i=', i, ' j=',j, ' s=',surf_def_h(0)%start_index(j,i), & 2385 ' e=', surf_def_h(0)%end_index(j,i) 2386 ENDIF 2377 2387 ! 2378 2388 !-- Downward-facing surfaces, except model top -
palm/trunk/SOURCE/temperton_fft_mod.f90
r4182 r4366 9 9 ! ----------------- 10 10 ! $Id$ 11 ! vectorized routines added 12 ! 13 ! 4182 2019-08-22 15:20:23Z scharf 11 14 ! Corrected "Former revisions" section 12 15 ! … … 30 33 PRIVATE 31 34 32 PUBLIC set99, fft991cy 35 ! 36 !-- No interfaces for the serial routines, because these are still writte in FORTRAN77 37 INTERFACE fft991cy_vec 38 MODULE PROCEDURE fft991cy_vec 39 END INTERFACE fft991cy_vec 40 41 INTERFACE qpassm_vec 42 MODULE PROCEDURE qpassm_vec 43 END INTERFACE qpassm_vec 44 45 INTERFACE rpassm_vec 46 MODULE PROCEDURE rpassm_vec 47 END INTERFACE rpassm_vec 48 49 PUBLIC set99, fft991cy, fft991cy_vec 33 50 34 51 INTEGER(iwp), PARAMETER :: nfft = 32 !< maximum length of calls to *fft … … 2197 2214 END SUBROUTINE set99 2198 2215 2216 2217 SUBROUTINE fft991cy_vec( a, work, trigs, ifax, n, isign ) 2218 2219 USE kinds 2220 2221 IMPLICIT NONE 2222 2223 REAL(wp),DIMENSION(:,:) :: a !< 2224 REAL(wp),DIMENSION(:) :: trigs !< 2225 REAL(wp),DIMENSION(:,:) :: work !< 2226 INTEGER(iwp),DIMENSION(:),INTENT(IN) :: ifax !< 2227 2228 INTEGER(iwp) :: inc !< 2229 INTEGER(iwp) :: isign !< 2230 INTEGER(iwp) :: jump !< 2231 INTEGER(iwp) :: lot !< 2232 INTEGER(iwp) :: n !< 2233 2234 2235 INTEGER(iwp) :: i !< 2236 INTEGER(iwp) :: ia !< 2237 INTEGER(iwp) :: ibase !< 2238 INTEGER(iwp) :: ierr !< 2239 INTEGER(iwp) :: ifac !< 2240 INTEGER(iwp) :: igo !< 2241 INTEGER(iwp) :: ii !< 2242 INTEGER(iwp) :: istart !< 2243 INTEGER(iwp) :: ix !< 2244 INTEGER(iwp) :: iz !< 2245 INTEGER(iwp) :: j !< 2246 INTEGER(iwp) :: jbase !< 2247 INTEGER(iwp) :: jj !< 2248 INTEGER(iwp) :: k !< 2249 INTEGER(iwp) :: la !< 2250 INTEGER(iwp) :: nb !< 2251 INTEGER(iwp) :: nblox !< 2252 INTEGER(iwp) :: nfax !< 2253 INTEGER(iwp) :: nvex !< 2254 INTEGER(iwp) :: nx !< 2255 INTEGER(iwp) :: mm !< 2256 2257 2258 inc = 1 2259 jump = n 2260 lot = 1 2261 2262 IF ( ifax(10) /= n ) CALL set99( trigs, ifax, n ) 2263 nfax = ifax(1) 2264 nx = n + 1 2265 IF ( MOD(n,2) == 1 ) nx = n 2266 nblox = 1 2267 nvex = 1 2268 2269 IF ( isign == 1 ) THEN 2270 ! 2271 !-- Backward fft: spectral to gridpoint transform 2272 2273 istart = 1 2274 ia = istart 2275 i = istart 2276 a(:,i+inc) = 0.5_wp * a(:,i) 2277 IF ( MOD(n,2) /= 1 ) THEN 2278 i = istart + n * inc 2279 a(:,i) = 0.5_wp * a(:,i) 2280 ENDIF 2281 ia = istart + inc 2282 la = 1 2283 igo = + 1 2284 2285 DO k = 1, nfax 2286 2287 ifac = ifax(k+1) 2288 ierr = -1 2289 IF ( igo /= -1 ) THEN 2290 CALL rpassm_vec( a(:,ia:), a(:,ia+la*inc:), work, work(:,ifac*la+1:), trigs, inc, 1, & 2291 n, ifac, la, ierr ) 2292 ELSE 2293 CALL rpassm_vec( work, work(:,la+1:), a(:,ia:), a(:,ia+ifac*la*inc:), trigs, 1, inc, & 2294 n, ifac, la, ierr ) 2295 ENDIF 2296 ! 2297 !-- Following messages shouldn't appear in PALM applications 2298 IF ( ierr /= 0 ) THEN 2299 SELECT CASE (ierr) 2300 CASE (:-1) 2301 WRITE (nout,'(A,I5,A)') ' Vector length =',nvex,', greater than nfft' 2302 CASE (0) 2303 WRITE (nout,'(A,I3,A)') ' Factor =',ifac,', not catered for' 2304 CASE (1:) 2305 WRITE (nout,'(A,I3,A)') ' Factor =',ifac,', only catered for if la*ifac=n' 2306 END SELECT 2307 RETURN 2308 ENDIF 2309 2310 la = ifac * la 2311 igo = -igo 2312 ia = istart 2313 2314 ENDDO 2315 ! 2316 !-- If necessary, copy results back to a 2317 IF ( MOD(nfax,2) /= 0 ) THEN 2318 ibase = 1 2319 jbase = ia 2320 i = ibase 2321 j = jbase 2322 DO ii = 1, n 2323 a(:,j) = work(:,i) 2324 i = i + 1 2325 j = j + inc 2326 ENDDO 2327 ENDIF 2328 ! 2329 !-- Fill in zeros at end 2330 ix = istart + n*inc 2331 a(:,ix) = 0.0_wp 2332 a(:,ix+inc) = 0.0_wp 2333 2334 ELSEIF ( isign == -1 ) THEN 2335 ! 2336 !-- Forward fft: gridpoint to spectral transform 2337 istart = 1 2338 ia = istart 2339 la = n 2340 igo = + 1 2341 2342 DO k = 1, nfax 2343 2344 ifac = ifax(nfax+2-k) 2345 la = la / ifac 2346 ierr = -1 2347 2348 IF ( igo /= -1 ) THEN 2349 CALL qpassm_vec( a(:,ia:), a(:,ia+ifac*la*inc:), work, work(:,la+1:), trigs, inc, 1, & 2350 n, ifac, la, ierr ) 2351 ELSE 2352 CALL qpassm_vec( work, work(:,ifac*la+1:), a(:,ia:), a(:,ia+la*inc:), trigs, 1, inc, & 2353 n, ifac, la, ierr ) 2354 ENDIF 2355 2356 ! 2357 !-- Following messages shouldn't appear in PALM applications 2358 IF ( ierr /= 0 ) THEN 2359 SELECT CASE (ierr) 2360 CASE (0) 2361 WRITE (nout,'(A,I3,A)') ' Factor =',ifac,', not catered for' 2362 CASE (1:) 2363 WRITE (nout,'(A,I3,A)') ' Factor =',ifac,', only catered for if la*ifac=n' 2364 END SELECT 2365 RETURN 2366 ENDIF 2367 2368 igo = -igo 2369 ia = istart + inc 2370 2371 ENDDO 2372 ! 2373 !-- If necessary, copy results back to a 2374 IF ( MOD(nfax,2) /= 0 ) THEN 2375 ibase = 1 2376 jbase = ia 2377 i = ibase 2378 j = jbase 2379 DO ii = 1, n 2380 a(:,j) = work(:,i) 2381 i = i + 1 2382 j = j + inc 2383 ENDDO 2384 ENDIF 2385 ! 2386 !-- Shift a(0) and fill in zero imag parts 2387 ix = istart 2388 a(:,ix) = a(:,ix+inc) 2389 a(:,ix+inc) = 0.0_wp 2390 2391 IF ( MOD(n,2) /= 1 ) THEN 2392 iz = istart + (n+1) * inc 2393 a(:,iz) = 0.0_wp 2394 ENDIF 2395 2396 ENDIF 2397 2398 END SUBROUTINE fft991cy_vec 2399 2400 !------------------------------------------------------------------------------! 2401 ! Description: 2402 ! ------------ 2403 !> Performs one pass through data as part of 2404 !> multiple real fft (fourier analysis) routine. 2405 !> 2406 !> Method: 2407 !> 2408 !> a is first real input vector 2409 !> equivalence b(1) with a(ifac*la*inc1+1) 2410 !> c is first real output vector 2411 !> equivalence d(1) with c(la*inc2+1) 2412 !> trigs is a precalculated list of sines & cosines 2413 !> inc1 is the addressing increment for a 2414 !> inc2 is the addressing increment for c 2415 !> inc3 is the increment between input vectors a 2416 !> inc4 is the increment between output vectors c 2417 !> lot is the number of vectors 2418 !> n is the length of the vectors 2419 !> ifac is the current factor of n 2420 !> la = n/(product of factors used so far) 2421 !> ierr is an error indicator: 2422 !> 0 - pass completed without error 2423 !> 1 - lot greater than nfft 2424 !> 2 - ifac not catered for 2425 !> 3 - ifac only catered for if la=n/ifac 2426 !------------------------------------------------------------------------------! 2427 SUBROUTINE qpassm_vec( a, b, c, d, trigs, inc1, inc2, n, ifac, la, ierr ) 2428 2429 USE kinds 2430 2431 IMPLICIT NONE 2432 2433 INTEGER(iwp),INTENT(IN) :: ifac !< 2434 INTEGER(iwp),INTENT(IN) :: inc1 !< 2435 INTEGER(iwp),INTENT(IN) :: inc2 !< 2436 INTEGER(iwp),INTENT(IN) :: la !< 2437 INTEGER(iwp),INTENT(IN) :: n !< 2438 INTEGER(iwp),INTENT(OUT) :: ierr !< 2439 2440 ! 2441 !-- Arrays are dimensioned with n 2442 REAL(wp),DIMENSION(:,:) :: a !< 2443 REAL(wp),DIMENSION(:,:) :: b !< 2444 REAL(wp),DIMENSION(:,:) :: c !< 2445 REAL(wp),DIMENSION(:,:) :: d !< 2446 REAL(wp),DIMENSION(:),INTENT(IN) :: trigs !< 2447 2448 REAL(wp) :: a0 !< 2449 REAL(wp) :: a1 !< 2450 REAL(wp) :: a10 !< 2451 REAL(wp) :: a11 !< 2452 REAL(wp) :: a2 !< 2453 REAL(wp) :: a20 !< 2454 REAL(wp) :: a21 !< 2455 REAL(wp) :: a3 !< 2456 REAL(wp) :: a4 !< 2457 REAL(wp) :: a5 !< 2458 REAL(wp) :: a6 !< 2459 REAL(wp) :: b0 !< 2460 REAL(wp) :: b1 !< 2461 REAL(wp) :: b10 !< 2462 REAL(wp) :: b11 !< 2463 REAL(wp) :: b2 !< 2464 REAL(wp) :: b20 !< 2465 REAL(wp) :: b21 !< 2466 REAL(wp) :: b3 !< 2467 REAL(wp) :: b4 !< 2468 REAL(wp) :: b5 !< 2469 REAL(wp) :: b6 !< 2470 REAL(wp) :: c1 !< 2471 REAL(wp) :: c2 !< 2472 REAL(wp) :: c3 !< 2473 REAL(wp) :: c4 !< 2474 REAL(wp) :: c5 !< 2475 REAL(wp) :: qrt5 !< 2476 REAL(wp) :: s1 !< 2477 REAL(wp) :: s2 !< 2478 REAL(wp) :: s3 !< 2479 REAL(wp) :: s4 !< 2480 REAL(wp) :: s5 !< 2481 REAL(wp) :: sin36 !< 2482 REAL(wp) :: sin45 !< 2483 REAL(wp) :: sin60 !< 2484 REAL(wp) :: sin72 !< 2485 REAL(wp) :: z !< 2486 REAL(wp) :: zqrt5 !< 2487 REAL(wp) :: zsin36 !< 2488 REAL(wp) :: zsin45 !< 2489 REAL(wp) :: zsin60 !< 2490 REAL(wp) :: zsin72 !< 2491 2492 INTEGER(iwp) :: i !< 2493 INTEGER(iwp) :: ia !< 2494 INTEGER(iwp) :: ib !< 2495 INTEGER(iwp) :: ibase !< 2496 INTEGER(iwp) :: ic !< 2497 INTEGER(iwp) :: id !< 2498 INTEGER(iwp) :: ie !< 2499 INTEGER(iwp) :: if !< 2500 INTEGER(iwp) :: ig !< 2501 INTEGER(iwp) :: igo !< 2502 INTEGER(iwp) :: ih !< 2503 INTEGER(iwp) :: iink !< 2504 INTEGER(iwp) :: ijump !< 2505 INTEGER(iwp) :: ipl !< loop index parallel loop 2506 INTEGER(iwp) :: j !< 2507 INTEGER(iwp) :: ja !< 2508 INTEGER(iwp) :: jb !< 2509 INTEGER(iwp) :: jbase !< 2510 INTEGER(iwp) :: jc !< 2511 INTEGER(iwp) :: jd !< 2512 INTEGER(iwp) :: je !< 2513 INTEGER(iwp) :: jf !< 2514 INTEGER(iwp) :: jink !< 2515 INTEGER(iwp) :: k !< 2516 INTEGER(iwp) :: kb !< 2517 INTEGER(iwp) :: kc !< 2518 INTEGER(iwp) :: kd !< 2519 INTEGER(iwp) :: ke !< 2520 INTEGER(iwp) :: kf !< 2521 INTEGER(iwp) :: kstop !< 2522 INTEGER(iwp) :: l !< 2523 INTEGER(iwp) :: m !< 2524 2525 2526 DATA sin36/0.587785252292473_wp/, sin72/0.951056516295154_wp/, & 2527 qrt5/0.559016994374947_wp/, sin60/0.866025403784437_wp/ 2528 2529 2530 ierr = 0 2531 2532 m = n / ifac 2533 iink = la * inc1 2534 jink = la * inc2 2535 ijump = (ifac-1) * iink 2536 kstop = ( n-ifac ) / ( 2*ifac ) 2537 2538 ibase = 0 2539 jbase = 0 2540 igo = ifac - 1 2541 IF ( igo == 7 ) igo = 6 2542 2543 IF (igo < 1 .OR. igo > 6 ) THEN 2544 ierr = 2 2545 RETURN 2546 ENDIF 2547 2548 2549 SELECT CASE ( igo ) 2550 2551 ! 2552 !-- Coding for factor 2 2553 CASE ( 1 ) 2554 ia = 1 2555 ib = ia + iink 2556 ja = 1 2557 jb = ja + (2*m-la) * inc2 2558 2559 IF ( la /= m ) THEN 2560 2561 DO l = 1, la 2562 i = ibase 2563 j = jbase 2564 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 2565 c(:,jb+j) = a(:,ia+i) - a(:,ib+i) 2566 ibase = ibase + inc1 2567 jbase = jbase + inc2 2568 ENDDO 2569 2570 ja = ja + jink 2571 jink = 2 * jink 2572 jb = jb - jink 2573 ibase = ibase + ijump 2574 ijump = 2 * ijump + iink 2575 2576 IF ( ja /= jb ) THEN 2577 2578 DO k = la, kstop, la 2579 kb = k + k 2580 c1 = trigs(kb+1) 2581 s1 = trigs(kb+2) 2582 jbase = 0 2583 DO l = 1, la 2584 i = ibase 2585 j = jbase 2586 c(:,ja+j) = a(:,ia+i) + (c1*a(:,ib+i)+s1*b(:,ib+i)) 2587 c(:,jb+j) = a(:,ia+i) - (c1*a(:,ib+i)+s1*b(:,ib+i)) 2588 d(:,ja+j) = (c1*b(:,ib+i)-s1*a(:,ib+i)) + b(:,ia+i) 2589 d(:,jb+j) = (c1*b(:,ib+i)-s1*a(:,ib+i)) - b(:,ia+i) 2590 ibase = ibase + inc1 2591 jbase = jbase + inc2 2592 ENDDO 2593 2594 ibase = ibase + ijump 2595 ja = ja + jink 2596 jb = jb - jink 2597 ENDDO 2598 2599 IF ( ja > jb ) RETURN 2600 2601 ENDIF 2602 2603 jbase = 0 2604 DO l = 1, la 2605 i = ibase 2606 j = jbase 2607 c(:,ja+j) = a(:,ia+i) 2608 d(:,ja+j) = -a(:,ib+i) 2609 ibase = ibase + inc1 2610 jbase = jbase + inc2 2611 ENDDO 2612 2613 ELSE 2614 2615 z = 1.0_wp/REAL(n,KIND=wp) 2616 DO l = 1, la 2617 i = ibase 2618 j = jbase 2619 c(:,ja+j) = z*(a(:,ia+i)+a(:,ib+i)) 2620 c(:,jb+j) = z*(a(:,ia+i)-a(:,ib+i)) 2621 ibase = ibase + inc1 2622 jbase = jbase + inc2 2623 ENDDO 2624 2625 ENDIF 2626 2627 ! 2628 !-- Coding for factor 3 2629 CASE ( 2 ) 2630 2631 ia = 1 2632 ib = ia + iink 2633 ic = ib + iink 2634 ja = 1 2635 jb = ja + (2*m-la) * inc2 2636 jc = jb 2637 2638 IF ( la /= m ) THEN 2639 2640 DO l = 1, la 2641 i = ibase 2642 j = jbase 2643 c(:,ja+j) = a(:,ia+i) + (a(:,ib+i)+a(:,ic+i)) 2644 c(:,jb+j) = a(:,ia+i) - 0.5_wp*(a(:,ib+i)+a(:,ic+i)) 2645 d(:,jb+j) = sin60*(a(:,ic+i)-a(:,ib+i)) 2646 ibase = ibase + inc1 2647 jbase = jbase + inc2 2648 ENDDO 2649 2650 ja = ja + jink 2651 jink = 2 * jink 2652 jb = jb + jink 2653 jc = jc - jink 2654 ibase = ibase + ijump 2655 ijump = 2 * ijump + iink 2656 2657 IF ( ja /= jc ) THEN 2658 2659 DO k = la, kstop, la 2660 kb = k + k 2661 kc = kb + kb 2662 c1 = trigs(kb+1) 2663 s1 = trigs(kb+2) 2664 c2 = trigs(kc+1) 2665 s2 = trigs(kc+2) 2666 2667 jbase = 0 2668 DO l = 1, la 2669 i = ibase 2670 j = jbase 2671 DO ipl = 1, SIZE(a,1) 2672 a1 = (c1*a(ipl,ib+i)+s1*b(ipl,ib+i)) + (c2*a(ipl,ic+i)+s2*b(ipl,ic+i)) 2673 b1 = (c1*b(ipl,ib+i)-s1*a(ipl,ib+i)) + (c2*b(ipl,ic+i)-s2*a(ipl,ic+i)) 2674 a2 = a(ipl,ia+i) - 0.5_wp*a1 2675 b2 = b(ipl,ia+i) - 0.5_wp*b1 2676 a3 = sin60*((c1*a(ipl,ib+i)+s1*b(ipl,ib+i))-(c2*a(ipl,ic+i)+s2*b(ipl,ic+i))) 2677 b3 = sin60*((c1*b(ipl,ib+i)-s1*a(ipl,ib+i))-(c2*b(ipl,ic+i)-s2*a(ipl,ic+i))) 2678 c(ipl,ja+j) = a(ipl,ia+i) + a1 2679 d(ipl,ja+j) = b(ipl,ia+i) + b1 2680 c(ipl,jb+j) = a2 + b3 2681 d(ipl,jb+j) = b2 - a3 2682 c(ipl,jc+j) = a2 - b3 2683 d(ipl,jc+j) = -(b2+a3) 2684 ENDDO 2685 ibase = ibase + inc1 2686 jbase = jbase + inc2 2687 ENDDO 2688 ibase = ibase + ijump 2689 ja = ja + jink 2690 jb = jb + jink 2691 jc = jc - jink 2692 ENDDO 2693 2694 IF ( ja > jc ) RETURN 2695 2696 ENDIF 2697 2698 jbase = 0 2699 DO l = 1, la 2700 i = ibase 2701 j = jbase 2702 c(:,ja+j) = a(:,ia+i) + 0.5_wp*(a(:,ib+i)-a(:,ic+i)) 2703 d(:,ja+j) = -sin60*(a(:,ib+i)+a(:,ic+i)) 2704 c(:,jb+j) = a(:,ia+i) - (a(:,ib+i)-a(:,ic+i)) 2705 ibase = ibase + inc1 2706 jbase = jbase + inc2 2707 ENDDO 2708 2709 ELSE 2710 2711 z = 1.0_wp / REAL( n, KIND=wp ) 2712 zsin60 = z*sin60 2713 DO l = 1, la 2714 i = ibase 2715 j = jbase 2716 c(:,ja+j) = z*(a(:,ia+i)+(a(:,ib+i)+a(:,ic+i))) 2717 c(:,jb+j) = z*(a(:,ia+i)-0.5_wp*(a(:,ib+i)+a(:,ic+i))) 2718 d(:,jb+j) = zsin60*(a(:,ic+i)-a(:,ib+i)) 2719 ibase = ibase + inc1 2720 jbase = jbase + inc2 2721 ENDDO 2722 2723 ENDIF 2724 2725 ! 2726 !-- Coding for factor 4 2727 CASE ( 3 ) 2728 2729 ia = 1 2730 ib = ia + iink 2731 ic = ib + iink 2732 id = ic + iink 2733 ja = 1 2734 jb = ja + (2*m-la) * inc2 2735 jc = jb + 2*m*inc2 2736 jd = jb 2737 2738 IF ( la /= m ) THEN 2739 2740 DO l = 1, la 2741 2742 i = ibase 2743 j = jbase 2744 c(:,ja+j) = (a(:,ia+i)+a(:,ic+i)) + (a(:,ib+i)+a(:,id+i)) 2745 c(:,jc+j) = (a(:,ia+i)+a(:,ic+i)) - (a(:,ib+i)+a(:,id+i)) 2746 c(:,jb+j) = a(:,ia+i) - a(:,ic+i) 2747 d(:,jb+j) = a(:,id+i) - a(:,ib+i) 2748 ibase = ibase + inc1 2749 jbase = jbase + inc2 2750 ENDDO 2751 2752 ja = ja + jink 2753 jink = 2 * jink 2754 jb = jb + jink 2755 jc = jc - jink 2756 jd = jd - jink 2757 ibase = ibase + ijump 2758 ijump = 2 * ijump + iink 2759 2760 IF ( jb /= jc ) THEN 2761 2762 DO k = la, kstop, la 2763 kb = k + k 2764 kc = kb + kb 2765 kd = kc + kb 2766 c1 = trigs(kb+1) 2767 s1 = trigs(kb+2) 2768 c2 = trigs(kc+1) 2769 s2 = trigs(kc+2) 2770 c3 = trigs(kd+1) 2771 s3 = trigs(kd+2) 2772 jbase = 0 2773 DO l = 1, la 2774 i = ibase 2775 j = jbase 2776 DO ipl = 1, SIZE(a,1) 2777 a0 = a(ipl,ia+i) + (c2*a(ipl,ic+i)+s2*b(ipl,ic+i)) 2778 a2 = a(ipl,ia+i) - (c2*a(ipl,ic+i)+s2*b(ipl,ic+i)) 2779 a1 = (c1*a(ipl,ib+i)+s1*b(ipl,ib+i)) + (c3*a(ipl,id+i)+s3*b(ipl,id+i)) 2780 a3 = (c1*a(ipl,ib+i)+s1*b(ipl,ib+i)) - (c3*a(ipl,id+i)+s3*b(ipl,id+i)) 2781 b0 = b(ipl,ia+i) + (c2*b(ipl,ic+i)-s2*a(ipl,ic+i)) 2782 b2 = b(ipl,ia+i) - (c2*b(ipl,ic+i)-s2*a(ipl,ic+i)) 2783 b1 = (c1*b(ipl,ib+i)-s1*a(ipl,ib+i)) + (c3*b(ipl,id+i)-s3*a(ipl,id+i)) 2784 b3 = (c1*b(ipl,ib+i)-s1*a(ipl,ib+i)) - (c3*b(ipl,id+i)-s3*a(ipl,id+i)) 2785 c(ipl,ja+j) = a0 + a1 2786 c(ipl,jc+j) = a0 - a1 2787 d(ipl,ja+j) = b0 + b1 2788 d(ipl,jc+j) = b1 - b0 2789 c(ipl,jb+j) = a2 + b3 2790 c(ipl,jd+j) = a2 - b3 2791 d(ipl,jb+j) = b2 - a3 2792 d(ipl,jd+j) = -(b2+a3) 2793 ENDDO 2794 ibase = ibase + inc1 2795 jbase = jbase + inc2 2796 ENDDO 2797 ibase = ibase + ijump 2798 ja = ja + jink 2799 jb = jb + jink 2800 jc = jc - jink 2801 jd = jd - jink 2802 ENDDO 2803 2804 IF ( jb > jc ) RETURN 2805 2806 ENDIF 2807 2808 sin45 = SQRT( 0.5_wp ) 2809 jbase = 0 2810 DO l = 1, la 2811 i = ibase 2812 j = jbase 2813 c(:,ja+j) = a(:,ia+i) + sin45*(a(:,ib+i)-a(:,id+i)) 2814 c(:,jb+j) = a(:,ia+i) - sin45*(a(:,ib+i)-a(:,id+i)) 2815 d(:,ja+j) = -a(:,ic+i) - sin45*(a(:,ib+i)+a(:,id+i)) 2816 d(:,jb+j) = a(:,ic+i) - sin45*(a(:,ib+i)+a(:,id+i)) 2817 ibase = ibase + inc1 2818 jbase = jbase + inc2 2819 ENDDO 2820 2821 ELSE 2822 2823 z = 1.0_wp / REAL( n, KIND=wp ) 2824 DO l = 1, la 2825 i = ibase 2826 j = jbase 2827 c(:,ja+j) = z*((a(:,ia+i)+a(:,ic+i))+(a(:,ib+i)+a(:,id+i))) 2828 c(:,jc+j) = z*((a(:,ia+i)+a(:,ic+i))-(a(:,ib+i)+a(:,id+i))) 2829 c(:,jb+j) = z*(a(:,ia+i)-a(:,ic+i)) 2830 d(:,jb+j) = z*(a(:,id+i)-a(:,ib+i)) 2831 ibase = ibase + inc1 2832 jbase = jbase + inc2 2833 ENDDO 2834 2835 ENDIF 2836 2837 ! 2838 !-- Coding for factor 5 2839 CASE ( 4 ) 2840 2841 ia = 1 2842 ib = ia + iink 2843 ic = ib + iink 2844 id = ic + iink 2845 ie = id + iink 2846 ja = 1 2847 jb = ja + (2*m-la) * inc2 2848 jc = jb + 2*m*inc2 2849 jd = jc 2850 je = jb 2851 2852 IF ( la /= m ) THEN 2853 2854 DO l = 1, la 2855 i = ibase 2856 j = jbase 2857 DO ipl = 1, SIZE(a,1) 2858 a1 = a(ipl,ib+i) + a(ipl,ie+i) 2859 a3 = a(ipl,ib+i) - a(ipl,ie+i) 2860 a2 = a(ipl,ic+i) + a(ipl,id+i) 2861 a4 = a(ipl,ic+i) - a(ipl,id+i) 2862 a5 = a(ipl,ia+i) - 0.25_wp*(a1+a2) 2863 a6 = qrt5*(a1-a2) 2864 c(ipl,ja+j) = a(ipl,ia+i) + (a1+a2) 2865 c(ipl,jb+j) = a5 + a6 2866 c(ipl,jc+j) = a5 - a6 2867 d(ipl,jb+j) = -sin72*a3 - sin36*a4 2868 d(ipl,jc+j) = -sin36*a3 + sin72*a4 2869 ENDDO 2870 ibase = ibase + inc1 2871 jbase = jbase + inc2 2872 ENDDO 2873 2874 ja = ja + jink 2875 jink = 2 * jink 2876 jb = jb + jink 2877 jc = jc + jink 2878 jd = jd - jink 2879 je = je - jink 2880 ibase = ibase + ijump 2881 ijump = 2 * ijump + iink 2882 2883 IF ( jb /= jd ) THEN 2884 2885 DO k = la, kstop, la 2886 kb = k + k 2887 kc = kb + kb 2888 kd = kc + kb 2889 ke = kd + kb 2890 c1 = trigs(kb+1) 2891 s1 = trigs(kb+2) 2892 c2 = trigs(kc+1) 2893 s2 = trigs(kc+2) 2894 c3 = trigs(kd+1) 2895 s3 = trigs(kd+2) 2896 c4 = trigs(ke+1) 2897 s4 = trigs(ke+2) 2898 jbase = 0 2899 DO l = 1, la 2900 i = ibase 2901 j = jbase 2902 DO ipl = 1, SIZE(a,1) 2903 a1 = (c1*a(ipl,ib+i)+s1*b(ipl,ib+i)) + (c4*a(ipl,ie+i)+s4*b(ipl,ie+i)) 2904 a3 = (c1*a(ipl,ib+i)+s1*b(ipl,ib+i)) - (c4*a(ipl,ie+i)+s4*b(ipl,ie+i)) 2905 a2 = (c2*a(ipl,ic+i)+s2*b(ipl,ic+i)) + (c3*a(ipl,id+i)+s3*b(ipl,id+i)) 2906 a4 = (c2*a(ipl,ic+i)+s2*b(ipl,ic+i)) - (c3*a(ipl,id+i)+s3*b(ipl,id+i)) 2907 b1 = (c1*b(ipl,ib+i)-s1*a(ipl,ib+i)) + (c4*b(ipl,ie+i)-s4*a(ipl,ie+i)) 2908 b3 = (c1*b(ipl,ib+i)-s1*a(ipl,ib+i)) - (c4*b(ipl,ie+i)-s4*a(ipl,ie+i)) 2909 b2 = (c2*b(ipl,ic+i)-s2*a(ipl,ic+i)) + (c3*b(ipl,id+i)-s3*a(ipl,id+i)) 2910 b4 = (c2*b(ipl,ic+i)-s2*a(ipl,ic+i)) - (c3*b(ipl,id+i)-s3*a(ipl,id+i)) 2911 a5 = a(ipl,ia+i) - 0.25_wp*(a1+a2) 2912 a6 = qrt5*(a1-a2) 2913 b5 = b(ipl,ia+i) - 0.25_wp*(b1+b2) 2914 b6 = qrt5*(b1-b2) 2915 a10 = a5 + a6 2916 a20 = a5 - a6 2917 b10 = b5 + b6 2918 b20 = b5 - b6 2919 a11 = sin72*b3 + sin36*b4 2920 a21 = sin36*b3 - sin72*b4 2921 b11 = sin72*a3 + sin36*a4 2922 b21 = sin36*a3 - sin72*a4 2923 c(ipl,ja+j) = a(ipl,ia+i) + (a1+a2) 2924 c(ipl,jb+j) = a10 + a11 2925 c(ipl,je+j) = a10 - a11 2926 c(ipl,jc+j) = a20 + a21 2927 c(ipl,jd+j) = a20 - a21 2928 d(ipl,ja+j) = b(ipl,ia+i) + (b1+b2) 2929 d(ipl,jb+j) = b10 - b11 2930 d(ipl,je+j) = -(b10+b11) 2931 d(ipl,jc+j) = b20 - b21 2932 d(ipl,jd+j) = -(b20+b21) 2933 ENDDO 2934 ibase = ibase + inc1 2935 jbase = jbase + inc2 2936 ENDDO 2937 ibase = ibase + ijump 2938 ja = ja + jink 2939 jb = jb + jink 2940 jc = jc + jink 2941 jd = jd - jink 2942 je = je - jink 2943 ENDDO 2944 2945 IF ( jb > jd ) RETURN 2946 2947 ENDIF 2948 2949 jbase = 0 2950 DO l = 1, la 2951 i = ibase 2952 j = jbase 2953 DO ipl = 1, SIZE(a,1) 2954 a1 = a(ipl,ib+i) + a(ipl,ie+i) 2955 a3 = a(ipl,ib+i) - a(ipl,ie+i) 2956 a2 = a(ipl,ic+i) + a(ipl,id+i) 2957 a4 = a(ipl,ic+i) - a(ipl,id+i) 2958 a5 = a(ipl,ia+i) + 0.25_wp*(a3-a4) 2959 a6 = qrt5*(a3+a4) 2960 c(ipl,ja+j) = a5 + a6 2961 c(ipl,jb+j) = a5 - a6 2962 c(ipl,jc+j) = a(ipl,ia+i) - (a3-a4) 2963 d(ipl,ja+j) = -sin36*a1 - sin72*a2 2964 d(ipl,jb+j) = -sin72*a1 + sin36*a2 2965 ENDDO 2966 ibase = ibase + inc1 2967 jbase = jbase + inc2 2968 ENDDO 2969 2970 ELSE 2971 2972 z = 1.0_wp / REAL( n, KIND=wp ) 2973 zqrt5 = z * qrt5 2974 zsin36 = z * sin36 2975 zsin72 = z * sin72 2976 DO l = 1, la 2977 i = ibase 2978 j = jbase 2979 DO ipl = 1, SIZE(a,1) 2980 a1 = a(ipl,ib+i) + a(ipl,ie+i) 2981 a3 = a(ipl,ib+i) - a(ipl,ie+i) 2982 a2 = a(ipl,ic+i) + a(ipl,id+i) 2983 a4 = a(ipl,ic+i) - a(ipl,id+i) 2984 a5 = z*(a(ipl,ia+i)-0.25_wp*(a1+a2)) 2985 a6 = zqrt5*(a1-a2) 2986 c(ipl,ja+j) = z*(a(ipl,ia+i)+(a1+a2)) 2987 c(ipl,jb+j) = a5 + a6 2988 c(ipl,jc+j) = a5 - a6 2989 d(ipl,jb+j) = -zsin72*a3 - zsin36*a4 2990 d(ipl,jc+j) = -zsin36*a3 + zsin72*a4 2991 ENDDO 2992 ibase = ibase + inc1 2993 jbase = jbase + inc2 2994 ENDDO 2995 2996 ENDIF 2997 2998 ! 2999 !-- Coding for factor 6 3000 CASE ( 5 ) 3001 3002 ia = 1 3003 ib = ia + iink 3004 ic = ib + iink 3005 id = ic + iink 3006 ie = id + iink 3007 if = ie + iink 3008 ja = 1 3009 jb = ja + (2*m-la) * inc2 3010 jc = jb + 2*m*inc2 3011 jd = jc + 2*m*inc2 3012 je = jc 3013 jf = jb 3014 3015 IF ( la /= m ) THEN 3016 3017 DO l = 1, la 3018 i = ibase 3019 j = jbase 3020 DO ipl = 1, SIZE(a,1) 3021 a11 = (a(ipl,ic+i)+a(ipl,if+i)) + (a(ipl,ib+i)+a(ipl,ie+i)) 3022 c(ipl,ja+j) = (a(ipl,ia+i)+a(ipl,id+i)) + a11 3023 c(ipl,jc+j) = (a(ipl,ia+i)+a(ipl,id+i)-0.5_wp*a11) 3024 d(ipl,jc+j) = sin60*((a(ipl,ic+i)+a(ipl,if+i))-(a(ipl,ib+i)+a(ipl,ie+i))) 3025 a11 = (a(ipl,ic+i)-a(ipl,if+i)) + (a(ipl,ie+i)-a(ipl,ib+i)) 3026 c(ipl,jb+j) = (a(ipl,ia+i)-a(ipl,id+i)) - 0.5_wp*a11 3027 d(ipl,jb+j) = sin60*((a(ipl,ie+i)-a(ipl,ib+i))-(a(ipl,ic+i)-a(ipl,if+i))) 3028 c(ipl,jd+j) = (a(ipl,ia+i)-a(ipl,id+i)) + a11 3029 END DO 3030 ibase = ibase + inc1 3031 jbase = jbase + inc2 3032 ENDDO 3033 ja = ja + jink 3034 jink = 2 * jink 3035 jb = jb + jink 3036 jc = jc + jink 3037 jd = jd - jink 3038 je = je - jink 3039 jf = jf - jink 3040 ibase = ibase + ijump 3041 ijump = 2 * ijump + iink 3042 3043 IF ( jc /= jd ) THEN 3044 3045 DO k = la, kstop, la 3046 kb = k + k 3047 kc = kb + kb 3048 kd = kc + kb 3049 ke = kd + kb 3050 kf = ke + kb 3051 c1 = trigs(kb+1) 3052 s1 = trigs(kb+2) 3053 c2 = trigs(kc+1) 3054 s2 = trigs(kc+2) 3055 c3 = trigs(kd+1) 3056 s3 = trigs(kd+2) 3057 c4 = trigs(ke+1) 3058 s4 = trigs(ke+2) 3059 c5 = trigs(kf+1) 3060 s5 = trigs(kf+2) 3061 jbase = 0 3062 DO l = 1, la 3063 i = ibase 3064 j = jbase 3065 DO ipl = 1, SIZE(a,1) 3066 a1 = c1*a(ipl,ib+i) + s1*b(ipl,ib+i) 3067 b1 = c1*b(ipl,ib+i) - s1*a(ipl,ib+i) 3068 a2 = c2*a(ipl,ic+i) + s2*b(ipl,ic+i) 3069 b2 = c2*b(ipl,ic+i) - s2*a(ipl,ic+i) 3070 a3 = c3*a(ipl,id+i) + s3*b(ipl,id+i) 3071 b3 = c3*b(ipl,id+i) - s3*a(ipl,id+i) 3072 a4 = c4*a(ipl,ie+i) + s4*b(ipl,ie+i) 3073 b4 = c4*b(ipl,ie+i) - s4*a(ipl,ie+i) 3074 a5 = c5*a(ipl,if+i) + s5*b(ipl,if+i) 3075 b5 = c5*b(ipl,if+i) - s5*a(ipl,if+i) 3076 a11 = (a2+a5) + (a1+a4) 3077 a20 = (a(ipl,ia+i)+a3) - 0.5_wp*a11 3078 a21 = sin60*((a2+a5)-(a1+a4)) 3079 b11 = (b2+b5) + (b1+b4) 3080 b20 = (b(ipl,ia+i)+b3) - 0.5_wp*b11 3081 b21 = sin60*((b2+b5)-(b1+b4)) 3082 c(ipl,ja+j) = (a(ipl,ia+i)+a3) + a11 3083 d(ipl,ja+j) = (b(ipl,ia+i)+b3) + b11 3084 c(ipl,jc+j) = a20 - b21 3085 d(ipl,jc+j) = a21 + b20 3086 c(ipl,je+j) = a20 + b21 3087 d(ipl,je+j) = a21 - b20 3088 a11 = (a2-a5) + (a4-a1) 3089 a20 = (a(ipl,ia+i)-a3) - 0.5_wp*a11 3090 a21 = sin60*((a4-a1)-(a2-a5)) 3091 b11 = (b5-b2) - (b4-b1) 3092 b20 = (b3-b(ipl,ia+i)) - 0.5_wp*b11 3093 b21 = sin60*((b5-b2)+(b4-b1)) 3094 c(ipl,jb+j) = a20 - b21 3095 d(ipl,jb+j) = a21 - b20 3096 c(ipl,jd+j) = a11 + (a(ipl,ia+i)-a3) 3097 d(ipl,jd+j) = b11 + (b3-b(ipl,ia+i)) 3098 c(ipl,jf+j) = a20 + b21 3099 d(ipl,jf+j) = a21 + b20 3100 ENDDO 3101 ibase = ibase + inc1 3102 jbase = jbase + inc2 3103 ENDDO 3104 ibase = ibase + ijump 3105 ja = ja + jink 3106 jb = jb + jink 3107 jc = jc + jink 3108 jd = jd - jink 3109 je = je - jink 3110 jf = jf - jink 3111 ENDDO 3112 3113 IF ( jc > jd ) RETURN 3114 3115 ENDIF 3116 3117 jbase = 0 3118 DO l = 1, la 3119 i = ibase 3120 j = jbase 3121 c(:,ja+j) = (a(:,ia+i)+0.5_wp*(a(:,ic+i)-a(:,ie+i))) + sin60*(a(:,ib+i)-a(:,if+i)) 3122 d(:,ja+j) = -(a(:,id+i)+0.5_wp*(a(:,ib+i)+a(:,if+i))) - sin60*(a(:,ic+i)+a(:,ie+i)) 3123 c(:,jb+j) = a(:,ia+i) - (a(:,ic+i)-a(:,ie+i)) 3124 d(:,jb+j) = a(:,id+i) - (a(:,ib+i)+a(:,if+i)) 3125 c(:,jc+j) = (a(:,ia+i)+0.5_wp*(a(:,ic+i)-a(:,ie+i))) - sin60*(a(:,ib+i)-a(:,if+i)) 3126 d(:,jc+j) = -(a(:,id+i)+0.5_wp*(a(:,ib+i)+a(:,if+i))) + sin60*(a(:,ic+i)+a(:,ie+i)) 3127 ibase = ibase + inc1 3128 jbase = jbase + inc2 3129 ENDDO 3130 3131 ELSE 3132 3133 z = 1.0_wp/REAL(n,KIND=wp) 3134 zsin60 = z*sin60 3135 DO l = 1, la 3136 i = ibase 3137 j = jbase 3138 DO ipl = 1, SIZE(a,1) 3139 a11 = (a(ipl,ic+i)+a(ipl,if+i)) + (a(ipl,ib+i)+a(ipl,ie+i)) 3140 c(ipl,ja+j) = z*((a(ipl,ia+i)+a(ipl,id+i))+a11) 3141 c(ipl,jc+j) = z*((a(ipl,ia+i)+a(ipl,id+i))-0.5_wp*a11) 3142 d(ipl,jc+j) = zsin60*((a(ipl,ic+i)+a(ipl,if+i))-(a(ipl,ib+i)+a(ipl,ie+i))) 3143 a11 = (a(ipl,ic+i)-a(ipl,if+i)) + (a(ipl,ie+i)-a(ipl,ib+i)) 3144 c(ipl,jb+j) = z*((a(ipl,ia+i)-a(ipl,id+i))-0.5_wp*a11) 3145 d(ipl,jb+j) = zsin60*((a(ipl,ie+i)-a(ipl,ib+i))-(a(ipl,ic+i)-a(ipl,if+i))) 3146 c(ipl,jd+j) = z*((a(ipl,ia+i)-a(ipl,id+i))+a11) 3147 ENDDO 3148 ibase = ibase + inc1 3149 jbase = jbase + inc2 3150 ENDDO 3151 3152 ENDIF 3153 3154 ! 3155 !-- Coding for factor 8 3156 CASE ( 6 ) 3157 3158 IF ( la /= m ) THEN 3159 ierr = 3 3160 RETURN 3161 ENDIF 3162 3163 ia = 1 3164 ib = ia + iink 3165 ic = ib + iink 3166 id = ic + iink 3167 ie = id + iink 3168 if = ie + iink 3169 ig = if + iink 3170 ih = ig + iink 3171 ja = 1 3172 jb = ja + la * inc2 3173 jc = jb + 2*m*inc2 3174 jd = jc + 2*m*inc2 3175 je = jd + 2*m*inc2 3176 z = 1.0_wp / REAL( n, KIND=wp ) 3177 zsin45 = z * SQRT( 0.5_wp ) 3178 3179 DO l = 1, la 3180 i = ibase 3181 j = jbase 3182 c(:,ja+j) = z*(((a(:,ia+i)+a(:,ie+i))+(a(:,ic+i)+a(:,ig+i)))+((a(:,id+i)+ a(:,ih+i))+(a(:,ib+i)+a(:,if+i)))) 3183 c(:,je+j) = z*(((a(:,ia+i)+a(:,ie+i))+(a(:,ic+i)+a(:,ig+i)))-((a(:,id+i)+ a(:,ih+i))+(a(:,ib+i)+a(:,if+i)))) 3184 c(:,jc+j) = z*((a(:,ia+i)+a(:,ie+i))-(a(:,ic+i)+a(:,ig+i))) 3185 d(:,jc+j) = z*((a(:,id+i)+a(:,ih+i))-(a(:,ib+i)+a(:,if+i))) 3186 c(:,jb+j) = z*(a(:,ia+i)-a(:,ie+i)) + zsin45*((a(:,ih+i)-a(:,id+i))-(a(:,if+i)-a(:,ib+i))) 3187 c(:,jd+j) = z*(a(:,ia+i)-a(:,ie+i)) - zsin45*((a(:,ih+i)-a(:,id+i))-(a(:,if+i)-a(:,ib+i))) 3188 d(:,jb+j) = zsin45*((a(:,ih+i)-a(:,id+i))+(a(:,if+i)-a(:,ib+i))) + z*(a(:,ig+i)-a(:,ic+i)) 3189 d(:,jd+j) = zsin45*((a(:,ih+i)-a(:,id+i))+(a(:,if+i)-a(:,ib+i))) - z*(a(:,ig+i)-a(:,ic+i)) 3190 ibase = ibase + inc1 3191 jbase = jbase + inc2 3192 ENDDO 3193 3194 END SELECT 3195 3196 END SUBROUTINE qpassm_vec 3197 3198 !------------------------------------------------------------------------------! 3199 ! Description: 3200 ! ------------ 3201 !> Same as qpassm, but for backward fft 3202 !------------------------------------------------------------------------------! 3203 SUBROUTINE rpassm_vec(a, b, c, d, trigs, inc1, inc2, n, ifac, la, ierr ) 3204 3205 USE kinds 3206 3207 IMPLICIT NONE 3208 3209 INTEGER(iwp) :: ierr !< 3210 INTEGER(iwp) :: ifac !< 3211 INTEGER(iwp) :: inc1 !< 3212 INTEGER(iwp) :: inc2 !< 3213 INTEGER(iwp) :: la !< 3214 INTEGER(iwp) :: n !< 3215 ! 3216 !-- Arrays are dimensioned with n 3217 REAL(wp),DIMENSION(:,:) :: a !< 3218 REAL(wp),DIMENSION(:,:) :: b !< 3219 REAL(wp),DIMENSION(:,:) :: c !< 3220 REAL(wp),DIMENSION(:,:) :: d !< 3221 REAL(wp),DIMENSION(:),INTENT(IN) :: trigs !< 3222 3223 REAL(wp) :: c1 !< 3224 REAL(wp) :: c2 !< 3225 REAL(wp) :: c3 !< 3226 REAL(wp) :: c4 !< 3227 REAL(wp) :: c5 !< 3228 REAL(wp) :: qqrt5 !< 3229 REAL(wp) :: qrt5 !< 3230 REAL(wp) :: s1 !< 3231 REAL(wp) :: s2 !< 3232 REAL(wp) :: s3 !< 3233 REAL(wp) :: s4 !< 3234 REAL(wp) :: s5 !< 3235 REAL(wp) :: sin36 !< 3236 REAL(wp) :: sin45 !< 3237 REAL(wp) :: sin60 !< 3238 REAL(wp) :: sin72 !< 3239 REAL(wp) :: ssin36 !< 3240 REAL(wp) :: ssin45 !< 3241 REAL(wp) :: ssin60 !< 3242 REAL(wp) :: ssin72 !< 3243 3244 INTEGER(iwp) :: i !< 3245 INTEGER(iwp) :: ia !< 3246 INTEGER(iwp) :: ib !< 3247 INTEGER(iwp) :: ibase !< 3248 INTEGER(iwp) :: ic !< 3249 INTEGER(iwp) :: id !< 3250 INTEGER(iwp) :: ie !< 3251 INTEGER(iwp) :: if !< 3252 INTEGER(iwp) :: igo !< 3253 INTEGER(iwp) :: iink !< 3254 INTEGER(iwp) :: ipl !< loop index parallel loop 3255 INTEGER(iwp) :: j !< 3256 INTEGER(iwp) :: ja !< 3257 INTEGER(iwp) :: jb !< 3258 INTEGER(iwp) :: jbase !< 3259 INTEGER(iwp) :: jc !< 3260 INTEGER(iwp) :: jd !< 3261 INTEGER(iwp) :: je !< 3262 INTEGER(iwp) :: jf !< 3263 INTEGER(iwp) :: jg !< 3264 INTEGER(iwp) :: jh !< 3265 INTEGER(iwp) :: jink !< 3266 INTEGER(iwp) :: jump !< 3267 INTEGER(iwp) :: k !< 3268 INTEGER(iwp) :: kb !< 3269 INTEGER(iwp) :: kc !< 3270 INTEGER(iwp) :: kd !< 3271 INTEGER(iwp) :: ke !< 3272 INTEGER(iwp) :: kf !< 3273 INTEGER(iwp) :: kstop !< 3274 INTEGER(iwp) :: l !< 3275 INTEGER(iwp) :: m !< 3276 3277 REAL(wp) :: a10 !< 3278 REAL(wp) :: a11 !< 3279 REAL(wp) :: a20 !< 3280 REAL(wp) :: a21 !< 3281 REAL(wp) :: b10 !< 3282 REAL(wp) :: b11 !< 3283 REAL(wp) :: b20 !< 3284 REAL(wp) :: b21 !< 3285 3286 3287 DATA sin36/0.587785252292473_wp/, sin72/0.951056516295154_wp/, & 3288 qrt5/0.559016994374947_wp/, sin60/0.866025403784437_wp/ 3289 3290 3291 ierr = 0 3292 3293 m = n / ifac 3294 iink = la * inc1 3295 jink = la * inc2 3296 jump = (ifac-1) * jink 3297 kstop = (n-ifac) / (2*ifac) 3298 3299 ibase = 0 3300 jbase = 0 3301 igo = ifac - 1 3302 IF ( igo == 7 ) igo = 6 3303 IF ( igo < 1 .OR. igo > 6 ) THEN 3304 ierr = 2 3305 RETURN 3306 ENDIF 3307 3308 3309 SELECT CASE ( igo ) 3310 ! 3311 !-- Coding for factor 2 3312 CASE ( 1 ) 3313 3314 ia = 1 3315 ib = ia + (2*m-la) * inc1 3316 ja = 1 3317 jb = ja + jink 3318 3319 IF ( la /= m ) THEN 3320 3321 DO l = 1, la 3322 i = ibase 3323 j = jbase 3324 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 3325 c(:,jb+j) = a(:,ia+i) - a(:,ib+i) 3326 ibase = ibase + inc1 3327 jbase = jbase + inc2 3328 ENDDO 3329 3330 ia = ia + iink 3331 iink = 2 * iink 3332 ib = ib - iink 3333 ibase = 0 3334 jbase = jbase + jump 3335 jump = 2 * jump + jink 3336 3337 IF ( ia /= ib ) THEN 3338 3339 DO k = la, kstop, la 3340 kb = k + k 3341 c1 = trigs(kb+1) 3342 s1 = trigs(kb+2) 3343 ibase = 0 3344 DO l = 1, la 3345 i = ibase 3346 j = jbase 3347 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 3348 d(:,ja+j) = b(:,ia+i) - b(:,ib+i) 3349 c(:,jb+j) = c1*(a(:,ia+i)-a(:,ib+i)) - s1*(b(:,ia+i)+b(:,ib+i)) 3350 d(:,jb+j) = s1*(a(:,ia+i)-a(:,ib+i)) + c1*(b(:,ia+i)+b(:,ib+i)) 3351 ibase = ibase + inc1 3352 jbase = jbase + inc2 3353 ENDDO 3354 ia = ia + iink 3355 ib = ib - iink 3356 jbase = jbase + jump 3357 ENDDO 3358 3359 IF ( ia > ib ) RETURN 3360 3361 ENDIF 3362 3363 ibase = 0 3364 DO l = 1, la 3365 i = ibase 3366 j = jbase 3367 c(:,ja+j) = a(:,ia+i) 3368 c(:,jb+j) = -b(:,ia+i) 3369 ibase = ibase + inc1 3370 jbase = jbase + inc2 3371 ENDDO 3372 3373 ELSE 3374 3375 DO l = 1, la 3376 i = ibase 3377 j = jbase 3378 c(:,ja+j) = 2.0_wp*(a(:,ia+i)+a(:,ib+i)) 3379 c(:,jb+j) = 2.0_wp*(a(:,ia+i)-a(:,ib+i)) 3380 ibase = ibase + inc1 3381 jbase = jbase + inc2 3382 ENDDO 3383 3384 ENDIF 3385 3386 ! 3387 !-- Coding for factor 3 3388 CASE ( 2 ) 3389 3390 ia = 1 3391 ib = ia + (2*m-la) * inc1 3392 ic = ib 3393 ja = 1 3394 jb = ja + jink 3395 jc = jb + jink 3396 3397 IF ( la /= m ) THEN 3398 3399 DO l = 1, la 3400 i = ibase 3401 j = jbase 3402 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 3403 c(:,jb+j) = (a(:,ia+i)-0.5_wp*a(:,ib+i)) - (sin60*(b(:,ib+i))) 3404 c(:,jc+j) = (a(:,ia+i)-0.5_wp*a(:,ib+i)) + (sin60*(b(:,ib+i))) 3405 ibase = ibase + inc1 3406 jbase = jbase + inc2 3407 ENDDO 3408 ia = ia + iink 3409 iink = 2 * iink 3410 ib = ib + iink 3411 ic = ic - iink 3412 jbase = jbase + jump 3413 jump = 2 * jump + jink 3414 3415 IF ( ia /= ic ) THEN 3416 3417 DO k = la, kstop, la 3418 kb = k + k 3419 kc = kb + kb 3420 c1 = trigs(kb+1) 3421 s1 = trigs(kb+2) 3422 c2 = trigs(kc+1) 3423 s2 = trigs(kc+2) 3424 ibase = 0 3425 DO l = 1, la 3426 i = ibase 3427 j = jbase 3428 c(:,ja+j) = a(:,ia+i) + (a(:,ib+i)+a(:,ic+i)) 3429 d(:,ja+j) = b(:,ia+i) + (b(:,ib+i)-b(:,ic+i)) 3430 c(:,jb+j) = c1*((a(:,ia+i)-0.5_wp*(a(:,ib+i)+a(:,ic+i)))-(sin60*(b(:,ib+i)+ b(:,ic+i)))) & 3431 - s1*((b(:,ia+i)-0.5_wp*(b(:,ib+i)-b(:,ic+i)))+(sin60*(a(:,ib+i)- a(:,ic+i)))) 3432 d(:,jb+j) = s1*((a(:,ia+i)-0.5_wp*(a(:,ib+i)+a(:,ic+i)))-(sin60*(b(:,ib+i)+ b(:,ic+i)))) & 3433 + c1*((b(:,ia+i)-0.5_wp*(b(:,ib+i)-b(:,ic+i)))+(sin60*(a(:,ib+i)- a(:,ic+i)))) 3434 c(:,jc+j) = c2*((a(:,ia+i)-0.5_wp*(a(:,ib+i)+a(:,ic+i)))+(sin60*(b(:,ib+i)+ b(:,ic+i)))) & 3435 - s2*((b(:,ia+i)-0.5_wp*(b(:,ib+i)-b(:,ic+i)))-(sin60*(a(:,ib+i)- a(:,ic+i)))) 3436 d(:,jc+j) = s2*((a(:,ia+i)-0.5_wp*(a(:,ib+i)+a(:,ic+i)))+(sin60*(b(:,ib+i)+ b(:,ic+i)))) & 3437 + c2*((b(:,ia+i)-0.5_wp*(b(:,ib+i)-b(:,ic+i)))-(sin60*(a(:,ib+i)- a(:,ic+i)))) 3438 ibase = ibase + inc1 3439 jbase = jbase + inc2 3440 ENDDO 3441 ia = ia + iink 3442 ib = ib + iink 3443 ic = ic - iink 3444 jbase = jbase + jump 3445 ENDDO 3446 3447 IF ( ia > ic ) RETURN 3448 3449 ENDIF 3450 3451 ibase = 0 3452 DO l = 1, la 3453 i = ibase 3454 j = jbase 3455 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 3456 c(:,jb+j) = (0.5_wp*a(:,ia+i)-a(:,ib+i)) - (sin60*b(:,ia+i)) 3457 c(:,jc+j) = -(0.5_wp*a(:,ia+i)-a(:,ib+i)) - (sin60*b(:,ia+i)) 3458 ibase = ibase + inc1 3459 jbase = jbase + inc2 3460 ENDDO 3461 3462 ELSE 3463 3464 ssin60 = 2.0_wp * sin60 3465 DO l = 1, la 3466 i = ibase 3467 j = jbase 3468 c(:,ja+j) = 2.0_wp*(a(:,ia+i)+a(:,ib+i)) 3469 c(:,jb+j) = (2.0_wp*a(:,ia+i)-a(:,ib+i)) - (ssin60*b(:,ib+i)) 3470 c(:,jc+j) = (2.0_wp*a(:,ia+i)-a(:,ib+i)) + (ssin60*b(:,ib+i)) 3471 ibase = ibase + inc1 3472 jbase = jbase + inc2 3473 ENDDO 3474 3475 ENDIF 3476 3477 ! 3478 !-- Coding for factor 4 3479 CASE ( 3 ) 3480 3481 ia = 1 3482 ib = ia + (2*m-la) * inc1 3483 ic = ib + 2*m*inc1 3484 id = ib 3485 ja = 1 3486 jb = ja + jink 3487 jc = jb + jink 3488 jd = jc + jink 3489 3490 IF ( la /= m ) THEN 3491 3492 DO l = 1, la 3493 i = ibase 3494 j = jbase 3495 c(:,ja+j) = (a(:,ia+i)+a(:,ic+i)) + a(:,ib+i) 3496 c(:,jb+j) = (a(:,ia+i)-a(:,ic+i)) - b(:,ib+i) 3497 c(:,jc+j) = (a(:,ia+i)+a(:,ic+i)) - a(:,ib+i) 3498 c(:,jd+j) = (a(:,ia+i)-a(:,ic+i)) + b(:,ib+i) 3499 ibase = ibase + inc1 3500 jbase = jbase + inc2 3501 ENDDO 3502 ia = ia + iink 3503 iink = 2 * iink 3504 ib = ib + iink 3505 ic = ic - iink 3506 id = id - iink 3507 jbase = jbase + jump 3508 jump = 2 * jump + jink 3509 3510 IF ( ib /= ic ) THEN 3511 3512 DO k = la, kstop, la 3513 kb = k + k 3514 kc = kb + kb 3515 kd = kc + kb 3516 c1 = trigs(kb+1) 3517 s1 = trigs(kb+2) 3518 c2 = trigs(kc+1) 3519 s2 = trigs(kc+2) 3520 c3 = trigs(kd+1) 3521 s3 = trigs(kd+2) 3522 ibase = 0 3523 DO l = 1, la 3524 i = ibase 3525 j = jbase 3526 c(:,ja+j) = (a(:,ia+i)+a(:,ic+i)) + (a(:,ib+i)+a(:,id+i)) 3527 d(:,ja+j) = (b(:,ia+i)-b(:,ic+i)) + (b(:,ib+i)-b(:,id+i)) 3528 c(:,jc+j) = c2*((a(:,ia+i)+a(:,ic+i))-(a(:,ib+i)+a(:,id+i))) - s2*((b(:,ia+i)-b(:,ic+i))& 3529 -(b(:,ib+i)-b(:,id+i))) 3530 d(:,jc+j) = s2*((a(:,ia+i)+a(:,ic+i))-(a(:,ib+i)+a(:,id+i))) + c2*((b(:,ia+i)-b(:,ic+i))& 3531 -(b(:,ib+i)-b(:,id+i))) 3532 c(:,jb+j) = c1*((a(:,ia+i)-a(:,ic+i))-(b(:,ib+i)+b(:,id+i))) - s1*((b(:,ia+i)+b(:,ic+i))& 3533 +(a(:,ib+i)-a(:,id+i))) 3534 d(:,jb+j) = s1*((a(:,ia+i)-a(:,ic+i))-(b(:,ib+i)+b(:,id+i))) + c1*((b(:,ia+i)+b(:,ic+i))& 3535 +(a(:,ib+i)-a(:,id+i))) 3536 c(:,jd+j) = c3*((a(:,ia+i)-a(:,ic+i))+(b(:,ib+i)+b(:,id+i))) - s3*((b(:,ia+i)+b(:,ic+i))& 3537 -(a(:,ib+i)-a(:,id+i))) 3538 d(:,jd+j) = s3*((a(:,ia+i)-a(:,ic+i))+(b(:,ib+i)+b(:,id+i))) + c3*((b(:,ia+i)+b(:,ic+i))& 3539 -(a(:,ib+i)-a(:,id+i))) 3540 ibase = ibase + inc1 3541 jbase = jbase + inc2 3542 ENDDO 3543 ia = ia + iink 3544 ib = ib + iink 3545 ic = ic - iink 3546 id = id - iink 3547 jbase = jbase + jump 3548 ENDDO 3549 3550 IF ( ib > ic ) RETURN 3551 3552 ENDIF 3553 3554 ibase = 0 3555 sin45 = SQRT( 0.5_wp ) 3556 DO l = 1, la 3557 i = ibase 3558 j = jbase 3559 c(:,ja+j) = a(:,ia+i) + a(:,ib+i) 3560 c(:,jb+j) = sin45*((a(:,ia+i)-a(:,ib+i))-(b(:,ia+i)+b(:,ib+i))) 3561 c(:,jc+j) = b(:,ib+i) - b(:,ia+i) 3562 c(:,jd+j) = -sin45*((a(:,ia+i)-a(:,ib+i))+(b(:,ia+i)+b(:,ib+i))) 3563 ibase = ibase + inc1 3564 jbase = jbase + inc2 3565 ENDDO 3566 3567 ELSE 3568 3569 DO l = 1, la 3570 i = ibase 3571 j = jbase 3572 c(:,ja+j) = 2.0_wp*((a(:,ia+i)+a(:,ic+i))+a(:,ib+i)) 3573 c(:,jb+j) = 2.0_wp*((a(:,ia+i)-a(:,ic+i))-b(:,ib+i)) 3574 c(:,jc+j) = 2.0_wp*((a(:,ia+i)+a(:,ic+i))-a(:,ib+i)) 3575 c(:,jd+j) = 2.0_wp*((a(:,ia+i)-a(:,ic+i))+b(:,ib+i)) 3576 ibase = ibase + inc1 3577 jbase = jbase + inc2 3578 ENDDO 3579 3580 ENDIF 3581 3582 ! 3583 !-- Coding for factor 5 3584 CASE ( 4 ) 3585 3586 ia = 1 3587 ib = ia + (2*m-la) * inc1 3588 ic = ib + 2*m*inc1 3589 id = ic 3590 ie = ib 3591 ja = 1 3592 jb = ja + jink 3593 jc = jb + jink 3594 jd = jc + jink 3595 je = jd + jink 3596 3597 IF ( la /= m ) THEN 3598 3599 DO l = 1, la 3600 i = ibase 3601 j = jbase 3602 c(:,ja+j) = a(:,ia+i) + (a(:,ib+i)+a(:,ic+i)) 3603 c(:,jb+j) = ((a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))+qrt5*(a(:,ib+i)-a(:,ic+i))) - & 3604 (sin72*b(:,ib+i)+sin36*b(:,ic+i)) 3605 c(:,jc+j) = ((a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))-qrt5*(a(:,ib+i)-a(:,ic+i))) - & 3606 (sin36*b(:,ib+i)-sin72*b(:,ic+i)) 3607 c(:,jd+j) = ((a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))-qrt5*(a(:,ib+i)-a(:,ic+i))) + & 3608 (sin36*b(:,ib+i)-sin72*b(:,ic+i)) 3609 c(:,je+j) = ((a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))+qrt5*(a(:,ib+i)-a(:,ic+i))) + & 3610 (sin72*b(:,ib+i)+sin36*b(:,ic+i)) 3611 ibase = ibase + inc1 3612 jbase = jbase + inc2 3613 ENDDO 3614 ia = ia + iink 3615 iink = 2 * iink 3616 ib = ib + iink 3617 ic = ic + iink 3618 id = id - iink 3619 ie = ie - iink 3620 jbase = jbase + jump 3621 jump = 2 * jump + jink 3622 3623 IF ( ib /= id ) THEN 3624 3625 DO k = la, kstop, la 3626 kb = k + k 3627 kc = kb + kb 3628 kd = kc + kb 3629 ke = kd + kb 3630 c1 = trigs(kb+1) 3631 s1 = trigs(kb+2) 3632 c2 = trigs(kc+1) 3633 s2 = trigs(kc+2) 3634 c3 = trigs(kd+1) 3635 s3 = trigs(kd+2) 3636 c4 = trigs(ke+1) 3637 s4 = trigs(ke+2) 3638 ibase = 0 3639 DO l = 1, la 3640 i = ibase 3641 j = jbase 3642 !DIR$ IVDEP 3643 DO ipl = 1, SIZE(a,1) 3644 a10 = (a(ipl,ia+i)-0.25_wp*((a(ipl,ib+i)+a(ipl,ie+i))+(a(ipl,ic+i)+a(ipl,id+i)))) + & 3645 qrt5*((a(ipl,ib+i)+a(ipl,ie+i))-(a(ipl,ic+i)+a(ipl,id+i))) 3646 a20 = (a(ipl,ia+i)-0.25_wp*((a(ipl,ib+i)+a(ipl,ie+i))+(a(ipl,ic+i)+a(ipl,id+i)))) - & 3647 qrt5*((a(ipl,ib+i)+a(ipl,ie+i))-(a(ipl,ic+i)+a(ipl,id+i))) 3648 b10 = (b(ipl,ia+i)-0.25_wp*((b(ipl,ib+i)-b(ipl,ie+i))+(b(ipl,ic+i)-b(ipl,id+i)))) + & 3649 qrt5*((b(ipl,ib+i)-b(ipl,ie+i))-(b(ipl,ic+i)-b(ipl,id+i))) 3650 b20 = (b(ipl,ia+i)-0.25_wp*((b(ipl,ib+i)-b(ipl,ie+i))+(b(ipl,ic+i)-b(ipl,id+i)))) - & 3651 qrt5*((b(ipl,ib+i)-b(ipl,ie+i))-(b(ipl,ic+i)-b(ipl,id+i))) 3652 a11 = sin72*(b(ipl,ib+i)+b(ipl,ie+i)) + sin36*(b(ipl,ic+i)+b(ipl,id+i)) 3653 a21 = sin36*(b(ipl,ib+i)+b(ipl,ie+i)) - sin72*(b(ipl,ic+i)+b(ipl,id+i)) 3654 b11 = sin72*(a(ipl,ib+i)-a(ipl,ie+i)) + sin36*(a(ipl,ic+i)-a(ipl,id+i)) 3655 b21 = sin36*(a(ipl,ib+i)-a(ipl,ie+i)) - sin72*(a(ipl,ic+i)-a(ipl,id+i)) 3656 c(ipl,ja+j) = a(ipl,ia+i) + ((a(ipl,ib+i)+a(ipl,ie+i))+(a(ipl,ic+i)+a(ipl,id+i))) 3657 d(ipl,ja+j) = b(ipl,ia+i) + ((b(ipl,ib+i)-b(ipl,ie+i))+(b(ipl,ic+i)-b(ipl,id+i))) 3658 c(ipl,jb+j) = c1*(a10 -a11 ) - s1*(b10 +b11 ) 3659 d(ipl,jb+j) = s1*(a10 -a11 ) + c1*(b10 +b11 ) 3660 c(ipl,je+j) = c4*(a10 +a11 ) - s4*(b10 -b11 ) 3661 d(ipl,je+j) = s4*(a10 +a11 ) + c4*(b10 -b11 ) 3662 c(ipl,jc+j) = c2*(a20 -a21 ) - s2*(b20 +b21 ) 3663 d(ipl,jc+j) = s2*(a20 -a21 ) + c2*(b20 +b21 ) 3664 c(ipl,jd+j) = c3*(a20 +a21 ) - s3*(b20 -b21 ) 3665 d(ipl,jd+j) = s3*(a20 +a21 ) + c3*(b20 -b21 ) 3666 ENDDO 3667 ibase = ibase + inc1 3668 jbase = jbase + inc2 3669 ENDDO 3670 ia = ia + iink 3671 ib = ib + iink 3672 ic = ic + iink 3673 id = id - iink 3674 ie = ie - iink 3675 jbase = jbase + jump 3676 ENDDO 3677 3678 IF ( ib > id ) RETURN 3679 3680 ENDIF 3681 3682 ibase = 0 3683 DO l = 1, la 3684 i = ibase 3685 j = jbase 3686 c(:,ja+j) = (a(:,ia+i)+a(:,ib+i)) + a(:,ic+i) 3687 c(:,jb+j) = (qrt5*(a(:,ia+i)-a(:,ib+i))+(0.25_wp*(a(:,ia+i)+a(:,ib+i))-a(:,ic+i))) - & 3688 (sin36*b(:,ia+i)+sin72*b(:,ib+i)) 3689 c(:,je+j) = -(qrt5*(a(:,ia+i)-a(:,ib+i))+(0.25_wp*(a(:,ia+i)+a(:,ib+i))-a(:,ic+i))) - & 3690 (sin36*b(:,ia+i)+sin72*b(:,ib+i)) 3691 c(:,jc+j) = (qrt5*(a(:,ia+i)-a(:,ib+i))-(0.25_wp*(a(:,ia+i)+a(:,ib+i))-a(:,ic+i))) - & 3692 (sin72*b(:,ia+i)-sin36*b(:,ib+i)) 3693 c(:,jd+j) = -(qrt5*(a(:,ia+i)-a(:,ib+i))-(0.25_wp*(a(:,ia+i)+a(:,ib+i))-a(:,ic+i))) - & 3694 (sin72*b(:,ia+i)-sin36*b(:,ib+i)) 3695 ibase = ibase + inc1 3696 jbase = jbase + inc2 3697 ENDDO 3698 3699 ELSE 3700 3701 qqrt5 = 2.0_wp * qrt5 3702 ssin36 = 2.0_wp * sin36 3703 ssin72 = 2.0_wp * sin72 3704 DO l = 1, la 3705 i = ibase 3706 j = jbase 3707 c(:,ja+j) = 2.0_wp*(a(:,ia+i)+(a(:,ib+i)+a(:,ic+i))) 3708 c(:,jb+j) = (2.0_wp*(a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))+qqrt5*(a(:,ib+i)-a(:,ic+i))) & 3709 - (ssin72*b(:,ib+i)+ssin36*b(:,ic+i)) 3710 c(:,jc+j) = (2.0_wp*(a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))-qqrt5*(a(:,ib+i)-a(:,ic+i))) & 3711 - (ssin36*b(:,ib+i)-ssin72*b(:,ic+i)) 3712 c(:,jd+j) = (2.0_wp*(a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))-qqrt5*(a(:,ib+i)-a(:,ic+i))) & 3713 + (ssin36*b(:,ib+i)-ssin72*b(:,ic+i)) 3714 c(:,je+j) = (2.0_wp*(a(:,ia+i)-0.25_wp*(a(:,ib+i)+a(:,ic+i)))+qqrt5*(a(:,ib+i)-a(:,ic+i))) & 3715 + (ssin72*b(:,ib+i)+ssin36*b(:,ic+i)) 3716 ibase = ibase + inc1 3717 jbase = jbase + inc2 3718 ENDDO 3719 3720 ENDIF 3721 3722 ! 3723 !-- Coding for factor 6 3724 CASE ( 5 ) 3725 3726 ia = 1 3727 ib = ia + (2*m-la) * inc1 3728 ic = ib + 2*m*inc1 3729 id = ic + 2*m*inc1 3730 ie = ic 3731 if = ib 3732 ja = 1 3733 jb = ja + jink 3734 jc = jb + jink 3735 jd = jc + jink 3736 je = jd + jink 3737 jf = je + jink 3738 3739 IF ( la /= m ) THEN 3740 3741 DO l = 1, la 3742 i = ibase 3743 j = jbase 3744 c(:,ja+j) = (a(:,ia+i)+a(:,id+i)) + (a(:,ib+i)+a(:,ic+i)) 3745 c(:,jd+j) = (a(:,ia+i)-a(:,id+i)) - (a(:,ib+i)-a(:,ic+i)) 3746 c(:,jb+j) = ((a(:,ia+i)-a(:,id+i))+0.5_wp*(a(:,ib+i)-a(:,ic+i))) - (sin60*(b(:,ib+i)+b(:,ic+i))) 3747 c(:,jf+j) = ((a(:,ia+i)-a(:,id+i))+0.5_wp*(a(:,ib+i)-a(:,ic+i))) + (sin60*(b(:,ib+i)+b(:,ic+i))) 3748 c(:,jc+j) = ((a(:,ia+i)+a(:,id+i))-0.5_wp*(a(:,ib+i)+a(:,ic+i))) - (sin60*(b(:,ib+i)-b(:,ic+i))) 3749 c(:,je+j) = ((a(:,ia+i)+a(:,id+i))-0.5_wp*(a(:,ib+i)+a(:,ic+i))) + (sin60*(b(:,ib+i)-b(:,ic+i))) 3750 ibase = ibase + inc1 3751 jbase = jbase + inc2 3752 ENDDO 3753 ia = ia + iink 3754 iink = 2 * iink 3755 ib = ib + iink 3756 ic = ic + iink 3757 id = id - iink 3758 ie = ie - iink 3759 if = if - iink 3760 jbase = jbase + jump 3761 jump = 2 * jump + jink 3762 3763 IF ( ic /= id ) THEN 3764 3765 DO k = la, kstop, la 3766 kb = k + k 3767 kc = kb + kb 3768 kd = kc + kb 3769 ke = kd + kb 3770 kf = ke + kb 3771 c1 = trigs(kb+1) 3772 s1 = trigs(kb+2) 3773 c2 = trigs(kc+1) 3774 s2 = trigs(kc+2) 3775 c3 = trigs(kd+1) 3776 s3 = trigs(kd+2) 3777 c4 = trigs(ke+1) 3778 s4 = trigs(ke+2) 3779 c5 = trigs(kf+1) 3780 s5 = trigs(kf+2) 3781 ibase = 0 3782 DO l = 1, la 3783 i = ibase 3784 j = jbase 3785 DO ipl = 1, SIZE(a,1) 3786 a11 = (a(ipl,ie+i)+a(ipl,ib+i)) + (a(ipl,ic+i)+a(ipl,if+i)) 3787 a20 = (a(ipl,ia+i)+a(ipl,id+i)) - 0.5_wp*a11 3788 a21 = sin60*((a(ipl,ie+i)+a(ipl,ib+i))-(a(ipl,ic+i)+a(ipl,if+i))) 3789 b11 = (b(ipl,ib+i)-b(ipl,ie+i)) + (b(ipl,ic+i)-b(ipl,if+i)) 3790 b20 = (b(ipl,ia+i)-b(ipl,id+i)) - 0.5_wp*b11 3791 b21 = sin60*((b(ipl,ib+i)-b(ipl,ie+i))-(b(ipl,ic+i)-b(ipl,if+i))) 3792 3793 c(ipl,ja+j) = (a(ipl,ia+i)+a(ipl,id+i)) + a11 3794 d(ipl,ja+j) = (b(ipl,ia+i)-b(ipl,id+i)) + b11 3795 c(ipl,jc+j) = c2*(a20 -b21 ) - s2*(b20 +a21 ) 3796 d(ipl,jc+j) = s2*(a20 -b21 ) + c2*(b20 +a21 ) 3797 c(ipl,je+j) = c4*(a20 +b21 ) - s4*(b20 -a21 ) 3798 d(ipl,je+j) = s4*(a20 +b21 ) + c4*(b20 -a21 ) 3799 3800 a11 = (a(ipl,ie+i)-a(ipl,ib+i)) + (a(ipl,ic+i)-a(ipl,if+i)) 3801 b11 = (b(ipl,ie+i)+b(ipl,ib+i)) - (b(ipl,ic+i)+b(ipl,if+i)) 3802 a20 = (a(ipl,ia+i)-a(ipl,id+i)) - 0.5_wp*a11 3803 a21 = sin60*((a(ipl,ie+i)-a(ipl,ib+i))-(a(ipl,ic+i)-a(ipl,if+i))) 3804 b20 = (b(ipl,ia+i)+b(ipl,id+i)) + 0.5_wp*b11 3805 b21 = sin60*((b(ipl,ie+i)+b(ipl,ib+i))+(b(ipl,ic+i)+b(ipl,if+i))) 3806 3807 c(ipl,jd+j) = c3*((a(ipl,ia+i)-a(ipl,id+i))+a11 ) - s3*((b(ipl,ia+i)+b(ipl,id+i))-b11 ) 3808 d(ipl,jd+j) = s3*((a(ipl,ia+i)-a(ipl,id+i))+a11 ) + c3*((b(ipl,ia+i)+b(ipl,id+i))-b11 ) 3809 c(ipl,jb+j) = c1*(a20 -b21 ) - s1*(b20 -a21 ) 3810 d(ipl,jb+j) = s1*(a20 -b21 ) + c1*(b20 -a21 ) 3811 c(ipl,jf+j) = c5*(a20 +b21 ) - s5*(b20 +a21 ) 3812 d(ipl,jf+j) = s5*(a20 +b21 ) + c5*(b20 +a21 ) 3813 3814 ENDDO 3815 ibase = ibase + inc1 3816 jbase = jbase + inc2 3817 ENDDO 3818 ia = ia + iink 3819 ib = ib + iink 3820 ic = ic + iink 3821 id = id - iink 3822 ie = ie - iink 3823 if = if - iink 3824 jbase = jbase + jump 3825 ENDDO 3826 3827 IF ( ic > id ) RETURN 3828 3829 ENDIF 3830 3831 ibase = 0 3832 DO l = 1, la 3833 i = ibase 3834 j = jbase 3835 DO ipl = 1, SIZE(a,1) 3836 c(ipl,ja+j) = a(ipl,ib+i) + (a(ipl,ia+i)+a(ipl,ic+i)) 3837 c(ipl,jd+j) = b(ipl,ib+i) - (b(ipl,ia+i)+b(ipl,ic+i)) 3838 c(ipl,jb+j) = (sin60*(a(ipl,ia+i)-a(ipl,ic+i))) - (0.5_wp*(b(ipl,ia+i)+b(ipl,ic+i))+b(ipl,ib+i)) 3839 c(ipl,jf+j) = -(sin60*(a(ipl,ia+i)-a(ipl,ic+i))) - (0.5_wp*(b(ipl,ia+i)+b(ipl,ic+i))+b(ipl,ib+i)) 3840 c(ipl,jc+j) = sin60*(b(ipl,ic+i)-b(ipl,ia+i)) + (0.5_wp*(a(ipl,ia+i)+a(ipl,ic+i))-a(ipl,ib+i)) 3841 c(ipl,je+j) = sin60*(b(ipl,ic+i)-b(ipl,ia+i)) - (0.5_wp*(a(ipl,ia+i)+a(ipl,ic+i))-a(ipl,ib+i)) 3842 ENDDO 3843 ibase = ibase + inc1 3844 jbase = jbase + inc2 3845 ENDDO 3846 3847 ELSE 3848 3849 ssin60 = 2.0_wp * sin60 3850 DO l = 1, la 3851 i = ibase 3852 j = jbase 3853 c(:,ja+j) = (2.0_wp*(a(:,ia+i)+a(:,id+i))) + (2.0_wp*(a(:,ib+i)+a(:,ic+i))) 3854 c(:,jd+j) = (2.0_wp*(a(:,ia+i)-a(:,id+i))) - (2.0_wp*(a(:,ib+i)-a(:,ic+i))) 3855 c(:,jb+j) = (2.0_wp*(a(:,ia+i)-a(:,id+i))+(a(:,ib+i)-a(:,ic+i))) - (ssin60*(b(:,ib+i)+b(:,ic+i))) 3856 c(:,jf+j) = (2.0_wp*(a(:,ia+i)-a(:,id+i))+(a(:,ib+i)-a(:,ic+i))) + (ssin60*(b(:,ib+i)+b(:,ic+i))) 3857 c(:,jc+j) = (2.0_wp*(a(:,ia+i)+a(:,id+i))-(a(:,ib+i)+a(:,ic+i))) - (ssin60*(b(:,ib+i)-b(:,ic+i))) 3858 c(:,je+j) = (2.0_wp*(a(:,ia+i)+a(:,id+i))-(a(:,ib+i)+a(:,ic+i))) + (ssin60*(b(:,ib+i)-b(:,ic+i))) 3859 ibase = ibase + inc1 3860 jbase = jbase + inc2 3861 ENDDO 3862 3863 ENDIF 3864 3865 ! 3866 !-- Coding for factor 8 3867 CASE ( 6 ) 3868 3869 IF ( la /= m ) THEN 3870 ierr = 3 3871 RETURN 3872 ENDIF 3873 3874 ia = 1 3875 ib = ia + la*inc1 3876 ic = ib + 2*la*inc1 3877 id = ic + 2*la*inc1 3878 ie = id + 2*la*inc1 3879 ja = 1 3880 jb = ja + jink 3881 jc = jb + jink 3882 jd = jc + jink 3883 je = jd + jink 3884 jf = je + jink 3885 jg = jf + jink 3886 jh = jg + jink 3887 ssin45 = SQRT( 2.0_wp ) 3888 3889 DO l = 1, la 3890 i = ibase 3891 j = jbase 3892 c(:,ja+j) = 2.0_wp*(((a(:,ia+i)+a(:,ie+i))+a(:,ic+i))+(a(:,ib+i)+a(:,id+i))) 3893 c(:,je+j) = 2.0_wp*(((a(:,ia+i)+a(:,ie+i))+a(:,ic+i))-(a(:,ib+i)+a(:,id+i))) 3894 c(:,jc+j) = 2.0_wp*(((a(:,ia+i)+a(:,ie+i))-a(:,ic+i))-(b(:,ib+i)-b(:,id+i))) 3895 c(:,jg+j) = 2.0_wp*(((a(:,ia+i)+a(:,ie+i))-a(:,ic+i))+(b(:,ib+i)-b(:,id+i))) 3896 c(:,jb+j) = 2.0_wp*((a(:,ia+i)-a(:,ie+i))-b(:,ic+i)) + ssin45*((a(:,ib+i)-a(:,id+i))-(b(:,ib+i)+b(:,id+i))) 3897 c(:,jf+j) = 2.0_wp*((a(:,ia+i)-a(:,ie+i))-b(:,ic+i)) - ssin45*((a(:,ib+i)-a(:,id+i))-(b(:,ib+i)+b(:,id+i))) 3898 c(:,jd+j) = 2.0_wp*((a(:,ia+i)-a(:,ie+i))+b(:,ic+i)) - ssin45*((a(:,ib+i)-a(:,id+i))+(b(:,ib+i)+b(:,id+i))) 3899 c(:,jh+j) = 2.0_wp*((a(:,ia+i)-a(:,ie+i))+b(:,ic+i)) + ssin45*((a(:,ib+i)-a(:,id+i))+(b(:,ib+i)+b(:,id+i))) 3900 ibase = ibase + inc1 3901 jbase = jbase + inc2 3902 ENDDO 3903 3904 END SELECT 3905 3906 END SUBROUTINE rpassm_vec 3907 2199 3908 END MODULE temperton_fft -
palm/trunk/SOURCE/transpose.f90
r4360 r4366 25 25 ! ----------------- 26 26 ! $Id$ 27 ! modifications for NEC vectorization 28 ! 29 ! 4360 2020-01-07 11:25:50Z suehring 27 30 ! Added missing OpenMP directives 28 31 ! … … 266 269 ONLY: cpu_log, cpu_log_nowait, log_point_s 267 270 271 USE fft_xy, & 272 ONLY: f_vec, temperton_fft_vec 273 268 274 USE indices, & 269 275 ONLY: nnx, nx, nxl, nxr, nyn, nys, nz … … 282 288 INTEGER(iwp) :: k !< 283 289 INTEGER(iwp) :: l !< 290 INTEGER(iwp) :: mm !< 284 291 INTEGER(iwp) :: xs !< 285 292 … … 292 299 #endif 293 300 294 295 ! 296 !-- If the PE grid is one-dimensional along y, the array has only to be 297 !-- reordered locally and therefore no transposition has to be done. 301 ! 302 !-- If the PE grid is one-dimensional along y, the array has only to be 303 !-- reordered locally and therefore no transposition has to be done. 298 304 IF ( pdims(1) /= 1 ) THEN 299 305 300 306 #if defined( __parallel ) 301 307 ! 302 !-- Reorder input array for transposition 303 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 304 DO l = 0, pdims(1) - 1 305 xs = 0 + l * nnx 306 #if __acc_fft_device 307 !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & 308 !$ACC PRESENT(work, f_in) 309 #endif 310 !$OMP DO 311 DO k = nzb_x, nzt_x 312 DO i = xs, xs + nnx - 1 313 DO j = nys_x, nyn_x 314 work(j,i-xs+1,k,l) = f_in(i,j,k) 308 !-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in 309 !-- different array format (f_vec). 310 IF ( temperton_fft_vec ) THEN 311 312 DO l = 0, pdims(1) - 1 313 xs = 0 + l * nnx 314 DO k = nzb_x, nzt_x 315 DO i = xs, xs + nnx - 1 316 DO j = nys_x, nyn_x 317 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 318 work(j,i-xs+1,k,l) = f_vec(mm,i) 319 ENDDO 315 320 ENDDO 316 321 ENDDO 317 322 ENDDO 318 !$OMP END DO NOWAIT 319 ENDDO 320 !$OMP END PARALLEL 323 324 ELSE 325 326 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 327 DO l = 0, pdims(1) - 1 328 xs = 0 + l * nnx 329 #if __acc_fft_device 330 !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & 331 !$ACC PRESENT(work, f_in) 332 #endif 333 !$OMP DO 334 DO k = nzb_x, nzt_x 335 DO i = xs, xs + nnx - 1 336 DO j = nys_x, nyn_x 337 work(j,i-xs+1,k,l) = f_in(i,j,k) 338 ENDDO 339 ENDDO 340 ENDDO 341 !$OMP END DO NOWAIT 342 ENDDO 343 !$OMP END PARALLEL 344 345 ENDIF 321 346 322 347 ! … … 836 861 ONLY: cpu_log, cpu_log_nowait, log_point_s 837 862 863 USE fft_xy, & 864 ONLY: f_vec, temperton_fft_vec 865 838 866 USE indices, & 839 867 ONLY: nnx, nx, nxl, nxr, nyn, nys, nz … … 852 880 INTEGER(iwp) :: k !< 853 881 INTEGER(iwp) :: l !< 882 INTEGER(iwp) :: mm !< 854 883 INTEGER(iwp) :: xs !< 855 884 … … 914 943 915 944 ! 916 !-- Reorder transposed array 917 ! $OMP PARALLEL PRIVATE ( i, j, k, l, xs )918 DO l = 0, pdims(1) - 1 919 xs = 0 + l * nnx920 #if __acc_fft_device 921 !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &922 !$ACC PRESENT(f_out, work)923 #endif 924 !$OMP DO925 DO k = nzb_x, nzt_x926 DO i = xs, xs + nnx - 1927 DO j = nys_x, nyn_x928 f_out(i,j,k) = work(j,i-xs+1,k,l)945 !-- Reorder transposed array. 946 !-- Data for the vectorized Temperton-fft is stored in different array format (f_vec) which saves 947 !-- additional data copy in fft_x. 948 IF ( temperton_fft_vec ) THEN 949 950 DO l = 0, pdims(1) - 1 951 xs = 0 + l * nnx 952 DO k = nzb_x, nzt_x 953 DO i = xs, xs + nnx - 1 954 DO j = nys_x, nyn_x 955 mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) 956 f_vec(mm,i) = work(j,i-xs+1,k,l) 957 ENDDO 929 958 ENDDO 930 959 ENDDO 931 960 ENDDO 932 !$OMP END DO NOWAIT 933 ENDDO 934 !$OMP END PARALLEL 935 #endif 961 962 ELSE 963 964 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 965 DO l = 0, pdims(1) - 1 966 xs = 0 + l * nnx 967 #if __acc_fft_device 968 !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & 969 !$ACC PRESENT(f_out, work) 970 #endif 971 !$OMP DO 972 DO k = nzb_x, nzt_x 973 DO i = xs, xs + nnx - 1 974 DO j = nys_x, nyn_x 975 f_out(i,j,k) = work(j,i-xs+1,k,l) 976 ENDDO 977 ENDDO 978 ENDDO 979 !$OMP END DO NOWAIT 980 ENDDO 981 !$OMP END PARALLEL 982 #endif 983 984 ENDIF 936 985 937 986 ENDIF
Note: See TracChangeset
for help on using the changeset viewer.