Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4406

Last change on this file since 4406 was 4370, checked in by raasch, 5 years ago
bugfixes for previous commit: unused variables removed, Temperton-fft usage on GPU, openacc porting of vector version of Obukhov length calculation, collective read switched off on NEC to avoid hanging; some vector directives added in prognostic equations to force vectorization on Intel19 compiler, configuration files for NEC Aurora added
Property svn:keywords set to `Id`
File size: 34.1 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4370 2020-01-10 14:00:44Z knoop $
27	! vector array renamed
28	!
29	! 4366 2020-01-09 08:12:43Z raasch
30	! modifications for NEC vectorization
31	!
32	! 4360 2020-01-07 11:25:50Z suehring
33	! Added missing OpenMP directives
34	!
35	! 4182 2019-08-22 15:20:23Z scharf
36	! Corrected "Former revisions" section
37	!
38	! 4171 2019-08-19 17:44:09Z gronemeier
39	! loop reordering for performance optimization
40	!
41	! 3832 2019-03-28 13:16:58Z raasch
42	! loop reordering for performance optimization
43	!
44	! 3694 2019-01-23 17:01:49Z knoop
45	! OpenACC port for SPEC
46	!
47	! Revision 1.1 1997/07/24 11:25:18 raasch
48	! Initial revision
49	!
50	!
51	! Description:
52	! ------------
53	!> Resorting data for the transposition from x to y. The transposition itself
54	!> is carried out in transpose_xy
55	!------------------------------------------------------------------------------!
56
57	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
58
59	SUBROUTINE resort_for_xy( f_in, f_inv )
60
61
62	USE indices, &
63	ONLY: nx
64
65	USE kinds
66
67	USE transpose_indices, &
68	ONLY: nyn_x, nys_x, nzb_x, nzt_x
69
70	IMPLICIT NONE
71
72	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
73	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
74
75
76	INTEGER(iwp) :: i !<
77	INTEGER(iwp) :: j !<
78	INTEGER(iwp) :: k !<
79	!
80	!-- Rearrange indices of input array in order to make data to be send
81	!-- by MPI contiguous
82	!$OMP PARALLEL PRIVATE ( i, j, k )
83	!$OMP DO
84	#if __acc_fft_device
85	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
86	!$ACC PRESENT(f_inv, f_in)
87	#endif
88	DO k = nzb_x, nzt_x
89	DO j = nys_x, nyn_x
90	DO i = 0, nx
91	f_inv(j,k,i) = f_in(i,j,k)
92	ENDDO
93	ENDDO
94	ENDDO
95	!$OMP END PARALLEL
96
97	END SUBROUTINE resort_for_xy
98
99
100	!------------------------------------------------------------------------------!
101	! Description:
102	! ------------
103	!> Transposition of input array (f_in) from x to y. For the input array, all
104	!> elements along x reside on the same PE, while after transposition, all
105	!> elements along y reside on the same PE.
106	!------------------------------------------------------------------------------!
107	SUBROUTINE transpose_xy( f_inv, f_out )
108
109
110	USE cpulog, &
111	ONLY: cpu_log, cpu_log_nowait, log_point_s
112
113	USE indices, &
114	ONLY: nx, ny
115
116	USE kinds
117
118	USE pegrid
119
120	USE transpose_indices, &
121	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
122
123	IMPLICIT NONE
124
125	INTEGER(iwp) :: i !<
126	INTEGER(iwp) :: j !<
127	INTEGER(iwp) :: k !<
128	INTEGER(iwp) :: l !<
129	INTEGER(iwp) :: ys !<
130
131	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
132	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
133
134	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
135	#if __acc_fft_device
136	!$ACC DECLARE CREATE(work)
137	#endif
138
139
140	IF ( numprocs /= 1 ) THEN
141
142	#if defined( __parallel )
143	!
144	!-- Transpose array
145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
146
147	#if __acc_fft_device
148	#ifndef __cuda_aware_mpi
149	!$ACC UPDATE HOST(f_inv)
150	#else
151	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
152	#endif
153	#endif
154
155	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
156	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
157	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
158	comm1dy, ierr )
159
160	#if __acc_fft_device
161	#ifndef __cuda_aware_mpi
162	!$ACC UPDATE DEVICE(work)
163	#else
164	!$ACC END HOST_DATA
165	#endif
166	#endif
167
168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
169
170	!
171	!-- Reorder transposed array
172	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
173	DO l = 0, pdims(2) - 1
174	ys = 0 + l * ( nyn_x - nys_x + 1 )
175	#if __acc_fft_device
176	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
177	!$ACC PRESENT(f_out, work)
178	#endif
179	!$OMP DO
180	DO i = nxl_y, nxr_y
181	DO k = nzb_y, nzt_y
182	DO j = ys, ys + nyn_x - nys_x
183	f_out(j,i,k) = work(j-ys+1,k,i,l)
184	ENDDO
185	ENDDO
186	ENDDO
187	!$OMP END DO NOWAIT
188	ENDDO
189	!$OMP END PARALLEL
190	#endif
191
192	ELSE
193
194	!
195	!-- Reorder transposed array
196	!$OMP PARALLEL PRIVATE ( i, j, k )
197	!$OMP DO
198	#if __acc_fft_device
199	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
200	!$ACC PRESENT(f_out, f_inv)
201	#endif
202	DO k = nzb_y, nzt_y
203	DO i = nxl_y, nxr_y
204	DO j = 0, ny
205	f_out(j,i,k) = f_inv(j,k,i)
206	ENDDO
207	ENDDO
208	ENDDO
209	!$OMP END PARALLEL
210
211	ENDIF
212
213	END SUBROUTINE transpose_xy
214
215
216	!------------------------------------------------------------------------------!
217	! Description:
218	! ------------
219	!> Resorting data after the transposition from x to z. The transposition itself
220	!> is carried out in transpose_xz
221	!------------------------------------------------------------------------------!
222	SUBROUTINE resort_for_xz( f_inv, f_out )
223
224
225	USE indices, &
226	ONLY: nxl, nxr, nyn, nys, nz
227
228	USE kinds
229
230	IMPLICIT NONE
231
232	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
233	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
234
235	INTEGER(iwp) :: i !<
236	INTEGER(iwp) :: j !<
237	INTEGER(iwp) :: k !<
238	!
239	!-- Rearrange indices of input array in order to make data to be send
240	!-- by MPI contiguous.
241	!-- In case of parallel fft/transposition, scattered store is faster in
242	!-- backward direction!!!
243	!$OMP PARALLEL PRIVATE ( i, j, k )
244	!$OMP DO
245	#if __acc_fft_device
246	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
247	!$ACC PRESENT(f_out, f_inv)
248	#endif
249	DO i = nxl, nxr
250	DO j = nys, nyn
251	DO k = 1, nz
252	f_out(k,j,i) = f_inv(j,i,k)
253	ENDDO
254	ENDDO
255	ENDDO
256	!$OMP END PARALLEL
257
258	END SUBROUTINE resort_for_xz
259
260
261	!------------------------------------------------------------------------------!
262	! Description:
263	! ------------
264	!> Transposition of input array (f_in) from x to z. For the input array, all
265	!> elements along x reside on the same PE, while after transposition, all
266	!> elements along z reside on the same PE.
267	!------------------------------------------------------------------------------!
268	SUBROUTINE transpose_xz( f_in, f_inv )
269
270
271	USE cpulog, &
272	ONLY: cpu_log, cpu_log_nowait, log_point_s
273
274	USE fft_xy, &
275	ONLY: f_vec_x, temperton_fft_vec
276
277	USE indices, &
278	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
279
280	USE kinds
281
282	USE pegrid
283
284	USE transpose_indices, &
285	ONLY: nyn_x, nys_x, nzb_x, nzt_x
286
287	IMPLICIT NONE
288
289	INTEGER(iwp) :: i !<
290	INTEGER(iwp) :: j !<
291	INTEGER(iwp) :: k !<
292	INTEGER(iwp) :: l !<
293	INTEGER(iwp) :: mm !<
294	INTEGER(iwp) :: xs !<
295
296	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
297	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
298
299	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
300	#if __acc_fft_device
301	!$ACC DECLARE CREATE(work)
302	#endif
303
304	!
305	!-- If the PE grid is one-dimensional along y, the array has only to be
306	!-- reordered locally and therefore no transposition has to be done.
307	IF ( pdims(1) /= 1 ) THEN
308
309	#if defined( __parallel )
310	!
311	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
312	!-- different array format (f_vec_x).
313	IF ( temperton_fft_vec ) THEN
314
315	DO l = 0, pdims(1) - 1
316	xs = 0 + l * nnx
317	DO k = nzb_x, nzt_x
318	DO i = xs, xs + nnx - 1
319	DO j = nys_x, nyn_x
320	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
321	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
322	ENDDO
323	ENDDO
324	ENDDO
325	ENDDO
326
327	ELSE
328
329	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
330	DO l = 0, pdims(1) - 1
331	xs = 0 + l * nnx
332	#if __acc_fft_device
333	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
334	!$ACC PRESENT(work, f_in)
335	#endif
336	!$OMP DO
337	DO k = nzb_x, nzt_x
338	DO i = xs, xs + nnx - 1
339	DO j = nys_x, nyn_x
340	work(j,i-xs+1,k,l) = f_in(i,j,k)
341	ENDDO
342	ENDDO
343	ENDDO
344	!$OMP END DO NOWAIT
345	ENDDO
346	!$OMP END PARALLEL
347
348	ENDIF
349
350	!
351	!-- Transpose array
352	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
353
354	#if __acc_fft_device
355	#ifndef __cuda_aware_mpi
356	!$ACC UPDATE HOST(work)
357	#else
358	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
359	#endif
360	#endif
361
362	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
363	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
364	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
365	comm1dx, ierr )
366
367	#if __acc_fft_device
368	#ifndef __cuda_aware_mpi
369	!$ACC UPDATE DEVICE(f_inv)
370	#else
371	!$ACC END HOST_DATA
372	#endif
373	#endif
374
375	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
376	#endif
377
378	ELSE
379
380	!
381	!-- Reorder the array in a way that the z index is in first position
382	!$OMP PARALLEL PRIVATE ( i, j, k )
383	!$OMP DO
384	#if __acc_fft_device
385	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
386	!$ACC PRESENT(f_inv, f_in)
387	#endif
388	DO i = nxl, nxr
389	DO j = nys, nyn
390	DO k = 1, nz
391	f_inv(j,i,k) = f_in(i,j,k)
392	ENDDO
393	ENDDO
394	ENDDO
395	!$OMP END PARALLEL
396
397	ENDIF
398
399	END SUBROUTINE transpose_xz
400
401
402	!------------------------------------------------------------------------------!
403	! Description:
404	! ------------
405	!> Resorting data after the transposition from y to x. The transposition itself
406	!> is carried out in transpose_yx
407	!------------------------------------------------------------------------------!
408	SUBROUTINE resort_for_yx( f_inv, f_out )
409
410
411	USE indices, &
412	ONLY: nx
413
414	USE kinds
415
416	USE transpose_indices, &
417	ONLY: nyn_x, nys_x, nzb_x, nzt_x
418
419	IMPLICIT NONE
420
421	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
422	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
423
424
425	INTEGER(iwp) :: i !<
426	INTEGER(iwp) :: j !<
427	INTEGER(iwp) :: k !<
428	!
429	!-- Rearrange indices of input array in order to make data to be send
430	!-- by MPI contiguous
431	!$OMP PARALLEL PRIVATE ( i, j, k )
432	!$OMP DO
433	#if __acc_fft_device
434	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
435	!$ACC PRESENT(f_out, f_inv)
436	#endif
437	DO k = nzb_x, nzt_x
438	DO j = nys_x, nyn_x
439	DO i = 0, nx
440	f_out(i,j,k) = f_inv(j,k,i)
441	ENDDO
442	ENDDO
443	ENDDO
444	!$OMP END PARALLEL
445
446	END SUBROUTINE resort_for_yx
447
448
449	!------------------------------------------------------------------------------!
450	! Description:
451	! ------------
452	!> Transposition of input array (f_in) from y to x. For the input array, all
453	!> elements along y reside on the same PE, while after transposition, all
454	!> elements along x reside on the same PE.
455	!------------------------------------------------------------------------------!
456	SUBROUTINE transpose_yx( f_in, f_inv )
457
458
459	USE cpulog, &
460	ONLY: cpu_log, cpu_log_nowait, log_point_s
461
462	USE indices, &
463	ONLY: nx, ny
464
465	USE kinds
466
467	USE pegrid
468
469	USE transpose_indices, &
470	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
471
472	IMPLICIT NONE
473
474	INTEGER(iwp) :: i !<
475	INTEGER(iwp) :: j !<
476	INTEGER(iwp) :: k !<
477	INTEGER(iwp) :: l !<
478	INTEGER(iwp) :: ys !<
479
480	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
481	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
482
483	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
484	#if __acc_fft_device
485	!$ACC DECLARE CREATE(work)
486	#endif
487
488
489	IF ( numprocs /= 1 ) THEN
490
491	#if defined( __parallel )
492	!
493	!-- Reorder input array for transposition
494	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
495	DO l = 0, pdims(2) - 1
496	ys = 0 + l * ( nyn_x - nys_x + 1 )
497	#if __acc_fft_device
498	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
499	!$ACC PRESENT(work, f_in)
500	#endif
501	!$OMP DO
502	DO i = nxl_y, nxr_y
503	DO k = nzb_y, nzt_y
504	DO j = ys, ys + nyn_x - nys_x
505	work(j-ys+1,k,i,l) = f_in(j,i,k)
506	ENDDO
507	ENDDO
508	ENDDO
509	!$OMP END DO NOWAIT
510	ENDDO
511	!$OMP END PARALLEL
512
513	!
514	!-- Transpose array
515	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
516
517	#if __acc_fft_device
518	#ifndef __cuda_aware_mpi
519	!$ACC UPDATE HOST(work)
520	#else
521	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
522	#endif
523	#endif
524
525	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
526	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
527	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
528	comm1dy, ierr )
529
530	#if __acc_fft_device
531	#ifndef __cuda_aware_mpi
532	!$ACC UPDATE DEVICE(f_inv)
533	#else
534	!$ACC END HOST_DATA
535	#endif
536	#endif
537
538	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
539	#endif
540
541	ELSE
542
543	!
544	!-- Reorder array f_in the same way as ALLTOALL did it
545	!$OMP PARALLEL PRIVATE ( i, j, k )
546	!$OMP DO
547	#if __acc_fft_device
548	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
549	!$ACC PRESENT(f_inv, f_in)
550	#endif
551	DO i = nxl_y, nxr_y
552	DO k = nzb_y, nzt_y
553	DO j = 0, ny
554	f_inv(j,k,i) = f_in(j,i,k)
555	ENDDO
556	ENDDO
557	ENDDO
558	!$OMP END PARALLEL
559
560	ENDIF
561
562	END SUBROUTINE transpose_yx
563
564
565	!------------------------------------------------------------------------------!
566	! Description:
567	! ------------
568	!> Transposition of input array (f_in) from y to x. For the input array, all
569	!> elements along y reside on the same PE, while after transposition, all
570	!> elements along x reside on the same PE.
571	!> This is a direct transposition for arrays with indices in regular order
572	!> (k,j,i) (cf. transpose_yx).
573	!------------------------------------------------------------------------------!
574	SUBROUTINE transpose_yxd( f_in, f_out )
575
576
577	USE cpulog, &
578	ONLY: cpu_log, log_point_s
579
580	USE indices, &
581	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
582
583	USE kinds
584
585	USE pegrid
586
587	USE transpose_indices, &
588	ONLY: nyn_x, nys_x, nzb_x, nzt_x
589
590	IMPLICIT NONE
591
592	INTEGER(iwp) :: i !<
593	INTEGER(iwp) :: j !<
594	INTEGER(iwp) :: k !<
595	INTEGER(iwp) :: l !<
596	INTEGER(iwp) :: m !<
597	INTEGER(iwp) :: xs !<
598
599	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
600	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
601	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
602	REAL(wp) :: work(nnxnnynnz) !<
603	#if defined( __parallel )
604
605	!
606	!-- Rearrange indices of input array in order to make data to be send
607	!-- by MPI contiguous
608	DO k = 1, nz
609	DO j = nys, nyn
610	DO i = nxl, nxr
611	f_inv(i,k,j) = f_in(k,j,i)
612	ENDDO
613	ENDDO
614	ENDDO
615
616	!
617	!-- Transpose array
618	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
619	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
620	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
621	work(1), sendrecvcount_xy, MPI_REAL, &
622	comm1dx, ierr )
623	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
624
625	!
626	!-- Reorder transposed array
627	m = 0
628	DO l = 0, pdims(1) - 1
629	xs = 0 + l * nnx
630	DO j = nys_x, nyn_x
631	DO k = 1, nz
632	DO i = xs, xs + nnx - 1
633	m = m + 1
634	f_out(i,j,k) = work(m)
635	ENDDO
636	ENDDO
637	ENDDO
638	ENDDO
639
640	#endif
641
642	END SUBROUTINE transpose_yxd
643
644
645	!------------------------------------------------------------------------------!
646	! Description:
647	! ------------
648	!> Resorting data for the transposition from y to z. The transposition itself
649	!> is carried out in transpose_yz
650	!------------------------------------------------------------------------------!
651	SUBROUTINE resort_for_yz( f_in, f_inv )
652
653
654	USE indices, &
655	ONLY: ny
656
657	USE kinds
658
659	USE transpose_indices, &
660	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
661
662	IMPLICIT NONE
663
664	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
665	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
666
667	INTEGER(iwp) :: i !<
668	INTEGER(iwp) :: j !<
669	INTEGER(iwp) :: k !<
670
671	!
672	!-- Rearrange indices of input array in order to make data to be send
673	!-- by MPI contiguous
674	!$OMP PARALLEL PRIVATE ( i, j, k )
675	!$OMP DO
676	#if __acc_fft_device
677	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
678	!$ACC PRESENT(f_inv, f_in)
679	#endif
680	DO k = nzb_y, nzt_y
681	DO i = nxl_y, nxr_y
682	DO j = 0, ny
683	f_inv(i,k,j) = f_in(j,i,k)
684	ENDDO
685	ENDDO
686	ENDDO
687	!$OMP END PARALLEL
688
689	END SUBROUTINE resort_for_yz
690
691
692	!------------------------------------------------------------------------------!
693	! Description:
694	! ------------
695	!> Transposition of input array (f_in) from y to z. For the input array, all
696	!> elements along y reside on the same PE, while after transposition, all
697	!> elements along z reside on the same PE.
698	!------------------------------------------------------------------------------!
699	SUBROUTINE transpose_yz( f_inv, f_out )
700
701
702	USE cpulog, &
703	ONLY: cpu_log, cpu_log_nowait, log_point_s
704
705	USE indices, &
706	ONLY: ny, nz
707
708	USE kinds
709
710	USE pegrid
711
712	USE transpose_indices, &
713	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
714
715	IMPLICIT NONE
716
717	INTEGER(iwp) :: i !<
718	INTEGER(iwp) :: j !<
719	INTEGER(iwp) :: k !<
720	INTEGER(iwp) :: l !<
721	INTEGER(iwp) :: zs !<
722
723	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
724	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
725
726	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
727	#if __acc_fft_device
728	!$ACC DECLARE CREATE(work)
729	#endif
730
731
732	!
733	!-- If the PE grid is one-dimensional along y, only local reordering
734	!-- of the data is necessary and no transposition has to be done.
735	IF ( pdims(1) == 1 ) THEN
736
737	!$OMP PARALLEL PRIVATE ( i, j, k )
738	!$OMP DO
739	#if __acc_fft_device
740	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
741	!$ACC PRESENT(f_out, f_inv)
742	#endif
743	DO j = 0, ny
744	DO k = nzb_y, nzt_y
745	DO i = nxl_y, nxr_y
746	f_out(i,j,k) = f_inv(i,k,j)
747	ENDDO
748	ENDDO
749	ENDDO
750	!$OMP END PARALLEL
751
752	ELSE
753
754	#if defined( __parallel )
755	!
756	!-- Transpose array
757	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
758
759	#if __acc_fft_device
760	#ifndef __cuda_aware_mpi
761	!$ACC UPDATE HOST(f_inv)
762	#else
763	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
764	#endif
765	#endif
766
767	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
768	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
769	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
770	comm1dx, ierr )
771
772	#if __acc_fft_device
773	#ifndef __cuda_aware_mpi
774	!$ACC UPDATE DEVICE(work)
775	#else
776	!$ACC END HOST_DATA
777	#endif
778	#endif
779
780	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
781
782	!
783	!-- Reorder transposed array
784	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
785	DO l = 0, pdims(1) - 1
786	zs = 1 + l * ( nzt_y - nzb_y + 1 )
787	#if __acc_fft_device
788	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
789	!$ACC PRESENT(f_out, work)
790	#endif
791	!$OMP DO
792	DO j = nys_z, nyn_z
793	DO k = zs, zs + nzt_y - nzb_y
794	DO i = nxl_z, nxr_z
795	f_out(i,j,k) = work(i,k-zs+1,j,l)
796	ENDDO
797	ENDDO
798	ENDDO
799	!$OMP END DO NOWAIT
800	ENDDO
801	!$OMP END PARALLEL
802	#endif
803
804	ENDIF
805
806	END SUBROUTINE transpose_yz
807
808
809	!------------------------------------------------------------------------------!
810	! Description:
811	! ------------
812	!> Resorting data for the transposition from z to x. The transposition itself
813	!> is carried out in transpose_zx
814	!------------------------------------------------------------------------------!
815	SUBROUTINE resort_for_zx( f_in, f_inv )
816
817
818	USE indices, &
819	ONLY: nxl, nxr, nyn, nys, nz
820
821	USE kinds
822
823	IMPLICIT NONE
824
825	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
826	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
827
828	INTEGER(iwp) :: i !<
829	INTEGER(iwp) :: j !<
830	INTEGER(iwp) :: k !<
831
832	!
833	!-- Rearrange indices of input array in order to make data to be send
834	!-- by MPI contiguous
835	!$OMP PARALLEL PRIVATE ( i, j, k )
836	!$OMP DO
837	#if __acc_fft_device
838	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
839	!$ACC PRESENT(f_in, f_inv)
840	#endif
841	DO i = nxl, nxr
842	DO j = nys, nyn
843	DO k = 1,nz
844	f_inv(j,i,k) = f_in(k,j,i)
845	ENDDO
846	ENDDO
847	ENDDO
848	!$OMP END PARALLEL
849
850	END SUBROUTINE resort_for_zx
851
852
853	!------------------------------------------------------------------------------!
854	! Description:
855	! ------------
856	!> Transposition of input array (f_in) from z to x. For the input array, all
857	!> elements along z reside on the same PE, while after transposition, all
858	!> elements along x reside on the same PE.
859	!------------------------------------------------------------------------------!
860	SUBROUTINE transpose_zx( f_inv, f_out )
861
862
863	USE cpulog, &
864	ONLY: cpu_log, cpu_log_nowait, log_point_s
865
866	USE fft_xy, &
867	ONLY: f_vec_x, temperton_fft_vec
868
869	USE indices, &
870	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
871
872	USE kinds
873
874	USE pegrid
875
876	USE transpose_indices, &
877	ONLY: nyn_x, nys_x, nzb_x, nzt_x
878
879	IMPLICIT NONE
880
881	INTEGER(iwp) :: i !<
882	INTEGER(iwp) :: j !<
883	INTEGER(iwp) :: k !<
884	INTEGER(iwp) :: l !<
885	INTEGER(iwp) :: mm !<
886	INTEGER(iwp) :: xs !<
887
888	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
889	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
890
891	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
892	#if __acc_fft_device
893	!$ACC DECLARE CREATE(work)
894	#endif
895
896
897	!
898	!-- If the PE grid is one-dimensional along y, only local reordering
899	!-- of the data is necessary and no transposition has to be done.
900	IF ( pdims(1) == 1 ) THEN
901
902	!$OMP PARALLEL PRIVATE ( i, j, k )
903	!$OMP DO
904	#if __acc_fft_device
905	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
906	!$ACC PRESENT(f_out, f_inv)
907	#endif
908	DO k = 1, nz
909	DO i = nxl, nxr
910	DO j = nys, nyn
911	f_out(i,j,k) = f_inv(j,i,k)
912	ENDDO
913	ENDDO
914	ENDDO
915	!$OMP END PARALLEL
916
917	ELSE
918
919	#if defined( __parallel )
920	!
921	!-- Transpose array
922	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
923
924	#if __acc_fft_device
925	#ifndef __cuda_aware_mpi
926	!$ACC UPDATE HOST(f_inv)
927	#else
928	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
929	#endif
930	#endif
931
932	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
933	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
934	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
935	comm1dx, ierr )
936
937	#if __acc_fft_device
938	#ifndef __cuda_aware_mpi
939	!$ACC UPDATE DEVICE(work)
940	#else
941	!$ACC END HOST_DATA
942	#endif
943	#endif
944
945	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
946
947	!
948	!-- Reorder transposed array.
949	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
950	!-- saves additional data copy in fft_x.
951	IF ( temperton_fft_vec ) THEN
952
953	DO l = 0, pdims(1) - 1
954	xs = 0 + l * nnx
955	DO k = nzb_x, nzt_x
956	DO i = xs, xs + nnx - 1
957	DO j = nys_x, nyn_x
958	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
959	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
960	ENDDO
961	ENDDO
962	ENDDO
963	ENDDO
964
965	ELSE
966
967	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
968	DO l = 0, pdims(1) - 1
969	xs = 0 + l * nnx
970	#if __acc_fft_device
971	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
972	!$ACC PRESENT(f_out, work)
973	#endif
974	!$OMP DO
975	DO k = nzb_x, nzt_x
976	DO i = xs, xs + nnx - 1
977	DO j = nys_x, nyn_x
978	f_out(i,j,k) = work(j,i-xs+1,k,l)
979	ENDDO
980	ENDDO
981	ENDDO
982	!$OMP END DO NOWAIT
983	ENDDO
984	!$OMP END PARALLEL
985	#endif
986
987	ENDIF
988
989	ENDIF
990
991	END SUBROUTINE transpose_zx
992
993
994	!------------------------------------------------------------------------------!
995	! Description:
996	! ------------
997	!> Resorting data after the transposition from z to y. The transposition itself
998	!> is carried out in transpose_zy
999	!------------------------------------------------------------------------------!
1000	SUBROUTINE resort_for_zy( f_inv, f_out )
1001
1002
1003	USE indices, &
1004	ONLY: ny
1005
1006	USE kinds
1007
1008	USE transpose_indices, &
1009	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1010
1011	IMPLICIT NONE
1012
1013	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1014	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1015
1016
1017	INTEGER(iwp) :: i !<
1018	INTEGER(iwp) :: j !<
1019	INTEGER(iwp) :: k !<
1020
1021	!
1022	!-- Rearrange indices of input array in order to make data to be send
1023	!-- by MPI contiguous
1024	!$OMP PARALLEL PRIVATE ( i, j, k )
1025	!$OMP DO
1026	#if __acc_fft_device
1027	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1028	!$ACC PRESENT(f_out, f_inv)
1029	#endif
1030	DO k = nzb_y, nzt_y
1031	DO i = nxl_y, nxr_y
1032	DO j = 0, ny
1033	f_out(j,i,k) = f_inv(i,k,j)
1034	ENDDO
1035	ENDDO
1036	ENDDO
1037	!$OMP END PARALLEL
1038
1039	END SUBROUTINE resort_for_zy
1040
1041
1042	!------------------------------------------------------------------------------!
1043	! Description:cpu_log_nowait
1044	! ------------
1045	!> Transposition of input array (f_in) from z to y. For the input array, all
1046	!> elements along z reside on the same PE, while after transposition, all
1047	!> elements along y reside on the same PE.
1048	!------------------------------------------------------------------------------!
1049	SUBROUTINE transpose_zy( f_in, f_inv )
1050
1051
1052	USE cpulog, &
1053	ONLY: cpu_log, cpu_log_nowait, log_point_s
1054
1055	USE indices, &
1056	ONLY: ny, nz
1057
1058	USE kinds
1059
1060	USE pegrid
1061
1062	USE transpose_indices, &
1063	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1064
1065	IMPLICIT NONE
1066
1067	INTEGER(iwp) :: i !<
1068	INTEGER(iwp) :: j !<
1069	INTEGER(iwp) :: k !<
1070	INTEGER(iwp) :: l !<
1071	INTEGER(iwp) :: zs !<
1072
1073	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1074	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1075
1076	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1077	#if __acc_fft_device
1078	!$ACC DECLARE CREATE(work)
1079	#endif
1080
1081	!
1082	!-- If the PE grid is one-dimensional along y, the array has only to be
1083	!-- reordered locally and therefore no transposition has to be done.
1084	IF ( pdims(1) /= 1 ) THEN
1085
1086	#if defined( __parallel )
1087	!
1088	!-- Reorder input array for transposition
1089	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1090	DO l = 0, pdims(1) - 1
1091	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1092	#if __acc_fft_device
1093	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1094	!$ACC PRESENT(work, f_in)
1095	#endif
1096	!$OMP DO
1097	DO j = nys_z, nyn_z
1098	DO k = zs, zs + nzt_y - nzb_y
1099	DO i = nxl_z, nxr_z
1100	work(i,k-zs+1,j,l) = f_in(i,j,k)
1101	ENDDO
1102	ENDDO
1103	ENDDO
1104	!$OMP END DO NOWAIT
1105	ENDDO
1106	!$OMP END PARALLEL
1107
1108	!
1109	!-- Transpose array
1110	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1111
1112	#if __acc_fft_device
1113	#ifndef __cuda_aware_mpi
1114	!$ACC UPDATE HOST(work)
1115	#else
1116	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1117	#endif
1118	#endif
1119
1120	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1121	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1122	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1123	comm1dx, ierr )
1124
1125	#if __acc_fft_device
1126	#ifndef __cuda_aware_mpi
1127	!$ACC UPDATE DEVICE(f_inv)
1128	#else
1129	!$ACC END HOST_DATA
1130	#endif
1131	#endif
1132
1133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1134	#endif
1135
1136	ELSE
1137	!
1138	!-- Reorder the array in the same way like ALLTOALL did it
1139	!$OMP PARALLEL PRIVATE ( i, j, k )
1140	!$OMP DO
1141	#if __acc_fft_device
1142	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1143	!$ACC PRESENT(f_inv, f_in)
1144	#endif
1145	DO k = nzb_y, nzt_y
1146	DO j = 0, ny
1147	DO i = nxl_y, nxr_y
1148	f_inv(i,k,j) = f_in(i,j,k)
1149	ENDDO
1150	ENDDO
1151	ENDDO
1152	!$OMP END PARALLEL
1153
1154	ENDIF
1155
1156	END SUBROUTINE transpose_zy
1157
1158
1159	!------------------------------------------------------------------------------!
1160	! Description:
1161	! ------------
1162	!> Transposition of input array (f_in) from z to y. For the input array, all
1163	!> elements along z reside on the same PE, while after transposition, all
1164	!> elements along y reside on the same PE.
1165	!> This is a direct transposition for arrays with indices in regular order
1166	!> (k,j,i) (cf. transpose_zy).
1167	!------------------------------------------------------------------------------!
1168	SUBROUTINE transpose_zyd( f_in, f_out )
1169
1170
1171	USE cpulog, &
1172	ONLY: cpu_log, log_point_s
1173
1174	USE indices, &
1175	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1176
1177	USE kinds
1178
1179	USE pegrid
1180
1181	USE transpose_indices, &
1182	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1183
1184	IMPLICIT NONE
1185
1186	INTEGER(iwp) :: i !<
1187	INTEGER(iwp) :: j !<
1188	INTEGER(iwp) :: k !<
1189	INTEGER(iwp) :: l !<
1190	INTEGER(iwp) :: m !<
1191	INTEGER(iwp) :: ys !<
1192
1193	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1194	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1195	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1196	REAL(wp) :: work(nnxnnynnz) !<
1197
1198	#if defined( __parallel )
1199
1200	!
1201	!-- Rearrange indices of input array in order to make data to be send
1202	!-- by MPI contiguous
1203	DO i = nxl, nxr
1204	DO j = nys, nyn
1205	DO k = 1, nz
1206	f_inv(j,i,k) = f_in(k,j,i)
1207	ENDDO
1208	ENDDO
1209	ENDDO
1210
1211	!
1212	!-- Move data to different array, because memory location of work1 is
1213	!-- needed further below (work1 = work2).
1214	!-- If the PE grid is one-dimensional along x, only local reordering
1215	!-- of the data is necessary and no transposition has to be done.
1216	IF ( pdims(2) == 1 ) THEN
1217	DO k = 1, nz
1218	DO i = nxl, nxr
1219	DO j = nys, nyn
1220	f_out(j,i,k) = f_inv(j,i,k)
1221	ENDDO
1222	ENDDO
1223	ENDDO
1224	RETURN
1225	ENDIF
1226
1227	!
1228	!-- Transpose array
1229	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1230	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1231	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1232	work(1), sendrecvcount_zyd, MPI_REAL, &
1233	comm1dy, ierr )
1234	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1235
1236	!
1237	!-- Reorder transposed array
1238	m = 0
1239	DO l = 0, pdims(2) - 1
1240	ys = 0 + l * nny
1241	DO k = nzb_yd, nzt_yd
1242	DO i = nxl_yd, nxr_yd
1243	DO j = ys, ys + nny - 1
1244	m = m + 1
1245	f_out(j,i,k) = work(m)
1246	ENDDO
1247	ENDDO
1248	ENDDO
1249	ENDDO
1250
1251	#endif
1252
1253	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |