Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4748

Last change on this file since 4748 was 4717, checked in by pavelkrc, 4 years ago
Fixes and optimizations of OpenMP parallelization, formatting of OpenMP directives
Property svn:keywords set to `Id`
File size: 46.3 KB

Line
1	!> @file transpose.f90
2	!--------------------------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
7	! (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
11	! Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
14	! <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2020 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------------------------!
18	!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4717 2020-09-30 22:27:40Z pavelkrc $
27	! Formatting of OpenMP directives (J. Resler)
28	!
29	! 4540 2020-05-18 15:23:29Z raasch
30	! File re-formatted to follow the PALM coding standard
31	!
32	! 4429 2020-02-27 15:24:30Z raasch
33	! Bugfix: cpp-directives added for serial mode
34	!
35	! 4415 2020-02-20 10:30:33Z raasch
36	! Bugfix for misplaced preprocessor directive
37	!
38	! 4370 2020-01-10 14:00:44Z raasch
39	! Vector array renamed
40	!
41	! 4366 2020-01-09 08:12:43Z raasch
42	! Modifications for NEC vectorization
43	!
44	! 4360 2020-01-07 11:25:50Z suehring
45	! Added missing OpenMP directives
46	!
47	! 4182 2019-08-22 15:20:23Z scharf
48	! Corrected "Former revisions" section
49	!
50	! 4171 2019-08-19 17:44:09Z gronemeier
51	! Loop reordering for performance optimization
52	!
53	! 3832 2019-03-28 13:16:58Z raasch
54	! Loop reordering for performance optimization
55	!
56	! 3694 2019-01-23 17:01:49Z knoop
57	! OpenACC port for SPEC
58	!
59	! Revision 1.1 1997/07/24 11:25:18 raasch
60	! Initial revision
61	!
62	!
63	! Description:
64	! ------------
65	!> Resorting data for the transposition from x to y. The transposition itself is carried out in
66	!> transpose_xy.
67	!--------------------------------------------------------------------------------------------------!
68
69	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
70
71	SUBROUTINE resort_for_xy( f_in, f_inv )
72
73
74	USE indices, &
75	ONLY: nx
76
77	USE kinds
78
79	USE transpose_indices, &
80	ONLY: nyn_x, &
81	nys_x, &
82	nzb_x, &
83	nzt_x
84
85	IMPLICIT NONE
86
87	INTEGER(iwp) :: i !<
88	INTEGER(iwp) :: j !<
89	INTEGER(iwp) :: k !<
90
91	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
92	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
93
94	!
95	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous
96	!$OMP PARALLEL PRIVATE ( i, j, k )
97	!$OMP DO
98	#if __acc_fft_device
99	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
100	!$ACC PRESENT(f_inv, f_in)
101	#endif
102	DO k = nzb_x, nzt_x
103	DO j = nys_x, nyn_x
104	DO i = 0, nx
105	f_inv(j,k,i) = f_in(i,j,k)
106	ENDDO
107	ENDDO
108	ENDDO
109	!$OMP END PARALLEL
110
111	END SUBROUTINE resort_for_xy
112
113
114	!--------------------------------------------------------------------------------------------------!
115	! Description:
116	! ------------
117	!> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside
118	!> on the same PE, while after transposition, all elements along y reside on the same PE.
119	!--------------------------------------------------------------------------------------------------!
120	SUBROUTINE transpose_xy( f_inv, f_out )
121
122
123	#if defined( __parallel )
124	USE cpulog, &
125	ONLY: cpu_log, &
126	cpu_log_nowait, &
127	log_point_s
128	#endif
129
130	USE indices, &
131	ONLY: nx, &
132	ny
133
134	USE kinds
135
136	USE pegrid
137
138	USE transpose_indices, &
139	ONLY: nxl_y, &
140	nxr_y, &
141	nyn_x, &
142	nys_x, &
143	nzb_x, &
144	nzb_y, &
145	nzt_x, &
146	nzt_y
147
148	IMPLICIT NONE
149
150	INTEGER(iwp) :: i !<
151	INTEGER(iwp) :: j !<
152	INTEGER(iwp) :: k !<
153
154	#if defined( __parallel )
155	INTEGER(iwp) :: l !<
156	INTEGER(iwp) :: ys !<
157	#endif
158
159	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
160	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
161
162	#if defined( __parallel )
163	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
164	#if __acc_fft_device
165	!$ACC DECLARE CREATE(work)
166	#endif
167	#endif
168
169
170	IF ( numprocs /= 1 ) THEN
171
172	#if defined( __parallel )
173	!
174	!-- Transpose array
175	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
176
177	#if __acc_fft_device
178	#ifndef __cuda_aware_mpi
179	!$ACC UPDATE HOST(f_inv)
180	#else
181	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
182	#endif
183	#endif
184
185	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
186	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
187	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
188
189	#if __acc_fft_device
190	#ifndef __cuda_aware_mpi
191	!$ACC UPDATE DEVICE(work)
192	#else
193	!$ACC END HOST_DATA
194	#endif
195	#endif
196
197	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
198
199	!
200	!-- Reorder transposed array
201	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
202	DO l = 0, pdims(2) - 1
203	ys = 0 + l * ( nyn_x - nys_x + 1 )
204	#if __acc_fft_device
205	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
206	!$ACC PRESENT(f_out, work)
207	#endif
208	!$OMP DO
209	DO i = nxl_y, nxr_y
210	DO k = nzb_y, nzt_y
211	DO j = ys, ys + nyn_x - nys_x
212	f_out(j,i,k) = work(j-ys+1,k,i,l)
213	ENDDO
214	ENDDO
215	ENDDO
216	!$OMP END DO NOWAIT
217	ENDDO
218	!$OMP END PARALLEL
219	#endif
220
221	ELSE
222
223	!
224	!-- Reorder transposed array
225	!$OMP PARALLEL PRIVATE ( i, j, k )
226	!$OMP DO
227	#if __acc_fft_device
228	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
229	!$ACC PRESENT(f_out, f_inv)
230	#endif
231	DO k = nzb_y, nzt_y
232	DO i = nxl_y, nxr_y
233	DO j = 0, ny
234	f_out(j,i,k) = f_inv(j,k,i)
235	ENDDO
236	ENDDO
237	ENDDO
238	!$OMP END PARALLEL
239
240	ENDIF
241
242	END SUBROUTINE transpose_xy
243
244
245	!--------------------------------------------------------------------------------------------------!
246	! Description:
247	! ------------
248	!> Resorting data after the transposition from x to z. The transposition itself is carried out in
249	!> transpose_xz.
250	!--------------------------------------------------------------------------------------------------!
251	SUBROUTINE resort_for_xz( f_inv, f_out )
252
253
254	USE indices, &
255	ONLY: nxl, &
256	nxr, &
257	nyn, &
258	nys, &
259	nz
260
261	USE kinds
262
263	IMPLICIT NONE
264
265	INTEGER(iwp) :: i !<
266	INTEGER(iwp) :: j !<
267	INTEGER(iwp) :: k !<
268
269	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
270	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
271
272	!
273	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
274	!-- In case of parallel fft/transposition, scattered store is faster in backward direction!!!
275	!$OMP PARALLEL PRIVATE ( i, j, k )
276	!$OMP DO
277	#if __acc_fft_device
278	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
279	!$ACC PRESENT(f_out, f_inv)
280	#endif
281	DO i = nxl, nxr
282	DO j = nys, nyn
283	DO k = 1, nz
284	f_out(k,j,i) = f_inv(j,i,k)
285	ENDDO
286	ENDDO
287	ENDDO
288	!$OMP END PARALLEL
289
290	END SUBROUTINE resort_for_xz
291
292
293	!--------------------------------------------------------------------------------------------------!
294	! Description:
295	! ------------
296	!> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside
297	!> on the same PE, while after transposition, all elements along z reside on the same PE.
298	!--------------------------------------------------------------------------------------------------!
299	SUBROUTINE transpose_xz( f_in, f_inv )
300
301	#if defined( __parallel )
302	USE cpulog, &
303	ONLY: cpu_log, &
304	cpu_log_nowait, &
305	log_point_s
306
307	USE fft_xy, &
308	ONLY: f_vec_x, &
309	temperton_fft_vec
310	#endif
311
312	USE indices, &
313	ONLY: nx, &
314	nxl, &
315	nxr, &
316	nyn, &
317	nys, &
318	nz
319
320	#if defined( __parallel )
321	USE indices, &
322	ONLY: nnx
323	#endif
324
325	USE kinds
326
327	USE pegrid
328
329	USE transpose_indices, &
330	ONLY: nyn_x, &
331	nys_x, &
332	nzb_x, &
333	nzt_x
334
335	IMPLICIT NONE
336
337	INTEGER(iwp) :: i !<
338	INTEGER(iwp) :: j !<
339	INTEGER(iwp) :: k !<
340
341	#if defined( __parallel )
342	INTEGER(iwp) :: l !<
343	INTEGER(iwp) :: mm !<
344	INTEGER(iwp) :: xs !<
345	#endif
346
347	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
348	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
349
350	#if defined( __parallel )
351	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
352	#if __acc_fft_device
353	!$ACC DECLARE CREATE(work)
354	#endif
355	#endif
356
357	!
358	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
359	!-- therefore no transposition has to be done.
360	IF ( pdims(1) /= 1 ) THEN
361
362	#if defined( __parallel )
363	!
364	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
365	!-- different array format (f_vec_x).
366	IF ( temperton_fft_vec ) THEN
367
368	DO l = 0, pdims(1) - 1
369	xs = 0 + l * nnx
370	DO k = nzb_x, nzt_x
371	DO i = xs, xs + nnx - 1
372	DO j = nys_x, nyn_x
373	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
374	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
375	ENDDO
376	ENDDO
377	ENDDO
378	ENDDO
379
380	ELSE
381
382	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
383	DO l = 0, pdims(1) - 1
384	xs = 0 + l * nnx
385	#if __acc_fft_device
386	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
387	!$ACC PRESENT(work, f_in)
388	#endif
389	!$OMP DO
390	DO k = nzb_x, nzt_x
391	DO i = xs, xs + nnx - 1
392	DO j = nys_x, nyn_x
393	work(j,i-xs+1,k,l) = f_in(i,j,k)
394	ENDDO
395	ENDDO
396	ENDDO
397	!$OMP END DO NOWAIT
398	ENDDO
399	!$OMP END PARALLEL
400
401	ENDIF
402
403	!
404	!-- Transpose array
405	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
406
407	#if __acc_fft_device
408	#ifndef __cuda_aware_mpi
409	!$ACC UPDATE HOST(work)
410	#else
411	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
412	#endif
413	#endif
414
415	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
416	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
417	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
418
419	#if __acc_fft_device
420	#ifndef __cuda_aware_mpi
421	!$ACC UPDATE DEVICE(f_inv)
422	#else
423	!$ACC END HOST_DATA
424	#endif
425	#endif
426
427	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
428	#endif
429
430	ELSE
431
432	!
433	!-- Reorder the array in a way that the z index is in first position
434	!$OMP PARALLEL PRIVATE ( i, j, k )
435	!$OMP DO
436	#if __acc_fft_device
437	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
438	!$ACC PRESENT(f_inv, f_in)
439	#endif
440	DO i = nxl, nxr
441	DO j = nys, nyn
442	DO k = 1, nz
443	f_inv(j,i,k) = f_in(i,j,k)
444	ENDDO
445	ENDDO
446	ENDDO
447	!$OMP END PARALLEL
448
449	ENDIF
450
451	END SUBROUTINE transpose_xz
452
453
454	!--------------------------------------------------------------------------------------------------!
455	! Description:
456	! ------------
457	!> Resorting data after the transposition from y to x. The transposition itself is carried out in
458	!> transpose_yx.
459	!--------------------------------------------------------------------------------------------------!
460	SUBROUTINE resort_for_yx( f_inv, f_out )
461
462
463	USE indices, &
464	ONLY: nx
465
466	USE kinds
467
468	USE transpose_indices, &
469	ONLY: nyn_x, &
470	nys_x, &
471	nzb_x, &
472	nzt_x
473
474	IMPLICIT NONE
475
476	INTEGER(iwp) :: i !<
477	INTEGER(iwp) :: j !<
478	INTEGER(iwp) :: k !<
479
480	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
481	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
482
483	!
484	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
485	!$OMP PARALLEL PRIVATE ( i, j, k )
486	!$OMP DO
487	#if __acc_fft_device
488	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
489	!$ACC PRESENT(f_out, f_inv)
490	#endif
491	DO k = nzb_x, nzt_x
492	DO j = nys_x, nyn_x
493	DO i = 0, nx
494	f_out(i,j,k) = f_inv(j,k,i)
495	ENDDO
496	ENDDO
497	ENDDO
498	!$OMP END PARALLEL
499
500	END SUBROUTINE resort_for_yx
501
502
503	!--------------------------------------------------------------------------------------------------!
504	! Description:
505	! ------------
506	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y
507	!> reside on the same PE, while after transposition, all elements along x reside on the same PE.
508	!--------------------------------------------------------------------------------------------------!
509	SUBROUTINE transpose_yx( f_in, f_inv )
510
511
512	#if defined( __parallel )
513	USE cpulog, &
514	ONLY: cpu_log, &
515	cpu_log_nowait, &
516	log_point_s
517	#endif
518
519	USE indices, &
520	ONLY: nx, &
521	ny
522
523	USE kinds
524
525	USE pegrid
526
527	USE transpose_indices, &
528	ONLY: nxl_y, &
529	nxr_y, &
530	nyn_x, &
531	nys_x, &
532	nzb_x, &
533	nzb_y, &
534	nzt_x, &
535	nzt_y
536
537	IMPLICIT NONE
538
539	INTEGER(iwp) :: i !<
540	INTEGER(iwp) :: j !<
541	INTEGER(iwp) :: k !<
542
543	#if defined( __parallel )
544	INTEGER(iwp) :: l !<
545	INTEGER(iwp) :: ys !<
546	#endif
547
548	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
549	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
550
551	#if defined( __parallel )
552	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
553	#if __acc_fft_device
554	!$ACC DECLARE CREATE(work)
555	#endif
556	#endif
557
558
559	IF ( numprocs /= 1 ) THEN
560
561	#if defined( __parallel )
562	!
563	!-- Reorder input array for transposition
564	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
565	DO l = 0, pdims(2) - 1
566	ys = 0 + l * ( nyn_x - nys_x + 1 )
567	#if __acc_fft_device
568	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
569	!$ACC PRESENT(work, f_in)
570	#endif
571	!$OMP DO
572	DO i = nxl_y, nxr_y
573	DO k = nzb_y, nzt_y
574	DO j = ys, ys + nyn_x - nys_x
575	work(j-ys+1,k,i,l) = f_in(j,i,k)
576	ENDDO
577	ENDDO
578	ENDDO
579	!$OMP END DO NOWAIT
580	ENDDO
581	!$OMP END PARALLEL
582
583	!
584	!-- Transpose array
585	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
586
587	#if __acc_fft_device
588	#ifndef __cuda_aware_mpi
589	!$ACC UPDATE HOST(work)
590	#else
591	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
592	#endif
593	#endif
594
595	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
596	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
597	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
598
599	#if __acc_fft_device
600	#ifndef __cuda_aware_mpi
601	!$ACC UPDATE DEVICE(f_inv)
602	#else
603	!$ACC END HOST_DATA
604	#endif
605	#endif
606
607	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
608	#endif
609
610	ELSE
611
612	!
613	!-- Reorder array f_in the same way as ALLTOALL did it.
614	!$OMP PARALLEL PRIVATE ( i, j, k )
615	!$OMP DO
616	#if __acc_fft_device
617	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
618	!$ACC PRESENT(f_inv, f_in)
619	#endif
620	DO i = nxl_y, nxr_y
621	DO k = nzb_y, nzt_y
622	DO j = 0, ny
623	f_inv(j,k,i) = f_in(j,i,k)
624	ENDDO
625	ENDDO
626	ENDDO
627	!$OMP END PARALLEL
628
629	ENDIF
630
631	END SUBROUTINE transpose_yx
632
633
634	!--------------------------------------------------------------------------------------------------!
635	! Description:
636	! ------------
637	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside
638	!> on the same PE, while after transposition, all elements along x reside on the same PE. This is a
639	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx).
640	!--------------------------------------------------------------------------------------------------!
641	#if defined( __parallel )
642	SUBROUTINE transpose_yxd( f_in, f_out )
643
644
645	USE cpulog, &
646	ONLY: cpu_log, &
647	log_point_s
648
649	USE indices, &
650	ONLY: nnx, &
651	nny, &
652	nnz, &
653	nx, &
654	nxl, &
655	nxr, &
656	nyn, &
657	nys, &
658	nz
659
660	USE kinds
661
662	USE pegrid
663
664	USE transpose_indices, &
665	ONLY: nyn_x, &
666	nys_x, &
667	nzb_x, &
668	nzt_x
669
670	IMPLICIT NONE
671
672	INTEGER(iwp) :: i !<
673	INTEGER(iwp) :: j !<
674	INTEGER(iwp) :: k !<
675	INTEGER(iwp) :: l !<
676	INTEGER(iwp) :: m !<
677	INTEGER(iwp) :: xs !<
678
679	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
680	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
681	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
682	REAL(wp) :: work(nnxnnynnz) !<
683
684	!
685	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
686	DO k = 1, nz
687	DO j = nys, nyn
688	DO i = nxl, nxr
689	f_inv(i,k,j) = f_in(k,j,i)
690	ENDDO
691	ENDDO
692	ENDDO
693
694	!
695	!-- Transpose array
696	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
697	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
698	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
699	work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
700	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
701
702	!
703	!-- Reorder transposed array
704	m = 0
705	DO l = 0, pdims(1) - 1
706	xs = 0 + l * nnx
707	DO j = nys_x, nyn_x
708	DO k = 1, nz
709	DO i = xs, xs + nnx - 1
710	m = m + 1
711	f_out(i,j,k) = work(m)
712	ENDDO
713	ENDDO
714	ENDDO
715	ENDDO
716
717	END SUBROUTINE transpose_yxd
718	#endif
719
720
721	!--------------------------------------------------------------------------------------------------!
722	! Description:
723	! ------------
724	!> Resorting data for the transposition from y to z. The transposition itself is carried out in
725	!> transpose_yz.
726	!--------------------------------------------------------------------------------------------------!
727	SUBROUTINE resort_for_yz( f_in, f_inv )
728
729
730	USE indices, &
731	ONLY: ny
732
733	USE kinds
734
735	USE transpose_indices, &
736	ONLY: nxl_y, &
737	nxr_y, &
738	nzb_y, &
739	nzt_y
740
741	IMPLICIT NONE
742
743	INTEGER(iwp) :: i !<
744	INTEGER(iwp) :: j !<
745	INTEGER(iwp) :: k !<
746
747	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
748	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
749
750	!
751	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
752	!$OMP PARALLEL PRIVATE ( i, j, k )
753	!$OMP DO
754	#if __acc_fft_device
755	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
756	!$ACC PRESENT(f_inv, f_in)
757	#endif
758	DO k = nzb_y, nzt_y
759	DO i = nxl_y, nxr_y
760	DO j = 0, ny
761	f_inv(i,k,j) = f_in(j,i,k)
762	ENDDO
763	ENDDO
764	ENDDO
765	!$OMP END PARALLEL
766
767	END SUBROUTINE resort_for_yz
768
769
770	!--------------------------------------------------------------------------------------------------!
771	! Description:
772	! ------------
773	!> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside
774	!> on the same PE, while after transposition, all elements along z reside on the same PE.
775	!--------------------------------------------------------------------------------------------------!
776	SUBROUTINE transpose_yz( f_inv, f_out )
777
778
779	#if defined( __parallel )
780	USE cpulog, &
781	ONLY: cpu_log, &
782	cpu_log_nowait, &
783	log_point_s
784	#endif
785
786	USE indices, &
787	ONLY: ny, &
788	nz
789
790	USE kinds
791
792	USE pegrid
793
794	USE transpose_indices, &
795	ONLY: nxl_y, &
796	nxl_z, &
797	nxr_y, &
798	nxr_z, &
799	nyn_z, &
800	nys_z, &
801	nzb_y, &
802	nzt_y
803
804	IMPLICIT NONE
805
806	INTEGER(iwp) :: i !<
807	INTEGER(iwp) :: j !<
808	INTEGER(iwp) :: k !<
809
810	#if defined( __parallel )
811	INTEGER(iwp) :: l !<
812	INTEGER(iwp) :: zs !<
813	#endif
814
815	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
816	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
817
818	#if defined( __parallel )
819	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
820	#if __acc_fft_device
821	!$ACC DECLARE CREATE(work)
822	#endif
823	#endif
824
825
826	!
827	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
828	!-- transposition has to be done.
829	IF ( pdims(1) == 1 ) THEN
830
831	!$OMP PARALLEL PRIVATE ( i, j, k )
832	!$OMP DO
833	#if __acc_fft_device
834	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
835	!$ACC PRESENT(f_out, f_inv)
836	#endif
837	DO j = 0, ny
838	DO k = nzb_y, nzt_y
839	DO i = nxl_y, nxr_y
840	f_out(i,j,k) = f_inv(i,k,j)
841	ENDDO
842	ENDDO
843	ENDDO
844	!$OMP END PARALLEL
845
846	ELSE
847
848	#if defined( __parallel )
849	!
850	!-- Transpose array
851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
852
853	#if __acc_fft_device
854	#ifndef __cuda_aware_mpi
855	!$ACC UPDATE HOST(f_inv)
856	#else
857	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
858	#endif
859	#endif
860
861	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
862	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
863	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
864
865	#if __acc_fft_device
866	#ifndef __cuda_aware_mpi
867	!$ACC UPDATE DEVICE(work)
868	#else
869	!$ACC END HOST_DATA
870	#endif
871	#endif
872
873	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
874
875	!
876	!-- Reorder transposed array
877	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
878	DO l = 0, pdims(1) - 1
879	zs = 1 + l * ( nzt_y - nzb_y + 1 )
880	#if __acc_fft_device
881	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
882	!$ACC PRESENT(f_out, work)
883	#endif
884	!$OMP DO
885	DO j = nys_z, nyn_z
886	DO k = zs, zs + nzt_y - nzb_y
887	DO i = nxl_z, nxr_z
888	f_out(i,j,k) = work(i,k-zs+1,j,l)
889	ENDDO
890	ENDDO
891	ENDDO
892	!$OMP END DO NOWAIT
893	ENDDO
894	!$OMP END PARALLEL
895	#endif
896
897	ENDIF
898
899	END SUBROUTINE transpose_yz
900
901
902	!--------------------------------------------------------------------------------------------------!
903	! Description:
904	! ------------
905	!> Resorting data for the transposition from z to x. The transposition itself is carried out in
906	!> transpose_zx.
907	!--------------------------------------------------------------------------------------------------!
908	SUBROUTINE resort_for_zx( f_in, f_inv )
909
910
911	USE indices, &
912	ONLY: nxl, &
913	nxr, &
914	nyn, &
915	nys, &
916	nz
917
918	USE kinds
919
920	IMPLICIT NONE
921
922	INTEGER(iwp) :: i !<
923	INTEGER(iwp) :: j !<
924	INTEGER(iwp) :: k !<
925
926	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
927	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
928
929	!
930	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
931	!$OMP PARALLEL PRIVATE ( i, j, k )
932	!$OMP DO
933	#if __acc_fft_device
934	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
935	!$ACC PRESENT(f_in, f_inv)
936	#endif
937	DO i = nxl, nxr
938	DO j = nys, nyn
939	DO k = 1,nz
940	f_inv(j,i,k) = f_in(k,j,i)
941	ENDDO
942	ENDDO
943	ENDDO
944	!$OMP END PARALLEL
945
946	END SUBROUTINE resort_for_zx
947
948
949	!--------------------------------------------------------------------------------------------------!
950	! Description:
951	! ------------
952	!> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside
953	!> on the same PE, while after transposition, all elements along x reside on the same PE.
954	!--------------------------------------------------------------------------------------------------!
955	SUBROUTINE transpose_zx( f_inv, f_out )
956
957
958	#if defined( __parallel )
959	USE cpulog, &
960	ONLY: cpu_log, &
961	cpu_log_nowait, &
962	log_point_s
963
964	USE fft_xy, &
965	ONLY: f_vec_x, &
966	temperton_fft_vec
967	#endif
968
969	USE indices, &
970	ONLY: nx, &
971	nxl, &
972	nxr, &
973	nyn, &
974	nys, &
975	nz
976
977	#if defined( __parallel )
978	USE indices, &
979	ONLY: nnx
980	#endif
981
982	USE kinds
983
984	USE pegrid
985
986	USE transpose_indices, &
987	ONLY: nyn_x, &
988	nys_x, &
989	nzb_x, &
990	nzt_x
991
992	IMPLICIT NONE
993
994	INTEGER(iwp) :: i !<
995	INTEGER(iwp) :: j !<
996	INTEGER(iwp) :: k !<
997
998	#if defined( __parallel )
999	INTEGER(iwp) :: l !<
1000	INTEGER(iwp) :: mm !<
1001	INTEGER(iwp) :: xs !<
1002	#endif
1003
1004	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1005	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
1006
1007	#if defined( __parallel )
1008	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
1009	#if __acc_fft_device
1010	!$ACC DECLARE CREATE(work)
1011	#endif
1012	#endif
1013
1014
1015	!
1016	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
1017	!-- transposition has to be done.
1018	IF ( pdims(1) == 1 ) THEN
1019
1020	!$OMP PARALLEL PRIVATE ( i, j, k )
1021	!$OMP DO
1022	#if __acc_fft_device
1023	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1024	!$ACC PRESENT(f_out, f_inv)
1025	#endif
1026	DO k = 1, nz
1027	DO i = nxl, nxr
1028	DO j = nys, nyn
1029	f_out(i,j,k) = f_inv(j,i,k)
1030	ENDDO
1031	ENDDO
1032	ENDDO
1033	!$OMP END PARALLEL
1034
1035	ELSE
1036
1037	#if defined( __parallel )
1038	!
1039	!-- Transpose array
1040	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1041
1042	#if __acc_fft_device
1043	#ifndef __cuda_aware_mpi
1044	!$ACC UPDATE HOST(f_inv)
1045	#else
1046	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1047	#endif
1048	#endif
1049
1050	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1051	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
1052	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
1053
1054	#if __acc_fft_device
1055	#ifndef __cuda_aware_mpi
1056	!$ACC UPDATE DEVICE(work)
1057	#else
1058	!$ACC END HOST_DATA
1059	#endif
1060	#endif
1061
1062	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1063
1064	!
1065	!-- Reorder transposed array.
1066	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
1067	!-- saves additional data copy in fft_x.
1068	IF ( temperton_fft_vec ) THEN
1069
1070	DO l = 0, pdims(1) - 1
1071	xs = 0 + l * nnx
1072	DO k = nzb_x, nzt_x
1073	DO i = xs, xs + nnx - 1
1074	DO j = nys_x, nyn_x
1075	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
1076	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
1077	ENDDO
1078	ENDDO
1079	ENDDO
1080	ENDDO
1081
1082	ELSE
1083
1084	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
1085	DO l = 0, pdims(1) - 1
1086	xs = 0 + l * nnx
1087	#if __acc_fft_device
1088	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1089	!$ACC PRESENT(f_out, work)
1090	#endif
1091	!$OMP DO
1092	DO k = nzb_x, nzt_x
1093	DO i = xs, xs + nnx - 1
1094	DO j = nys_x, nyn_x
1095	f_out(i,j,k) = work(j,i-xs+1,k,l)
1096	ENDDO
1097	ENDDO
1098	ENDDO
1099	!$OMP END DO NOWAIT
1100	ENDDO
1101	!$OMP END PARALLEL
1102
1103	ENDIF
1104
1105	#endif
1106
1107	ENDIF
1108
1109	END SUBROUTINE transpose_zx
1110
1111
1112	!--------------------------------------------------------------------------------------------------!
1113	! Description:
1114	! ------------
1115	!> Resorting data after the transposition from z to y. The transposition itself is carried out in
1116	!> transpose_zy.
1117	!--------------------------------------------------------------------------------------------------!
1118	SUBROUTINE resort_for_zy( f_inv, f_out )
1119
1120
1121	USE indices, &
1122	ONLY: ny
1123
1124	USE kinds
1125
1126	USE transpose_indices, &
1127	ONLY: nxl_y, &
1128	nxr_y, &
1129	nzb_y, &
1130	nzt_y
1131
1132	IMPLICIT NONE
1133
1134	INTEGER(iwp) :: i !<
1135	INTEGER(iwp) :: j !<
1136	INTEGER(iwp) :: k !<
1137
1138	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1139	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1140
1141	!
1142	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
1143	!$OMP PARALLEL PRIVATE ( i, j, k )
1144	!$OMP DO
1145	#if __acc_fft_device
1146	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1147	!$ACC PRESENT(f_out, f_inv)
1148	#endif
1149	DO k = nzb_y, nzt_y
1150	DO i = nxl_y, nxr_y
1151	DO j = 0, ny
1152	f_out(j,i,k) = f_inv(i,k,j)
1153	ENDDO
1154	ENDDO
1155	ENDDO
1156	!$OMP END PARALLEL
1157
1158	END SUBROUTINE resort_for_zy
1159
1160
1161	!--------------------------------------------------------------------------------------------------!
1162	! Description:cpu_log_nowait
1163	! ------------
1164	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
1165	!> on the same PE, while after transposition, all elements along y reside on the same PE.
1166	!--------------------------------------------------------------------------------------------------!
1167	SUBROUTINE transpose_zy( f_in, f_inv )
1168
1169
1170	#if defined( __parallel )
1171	USE cpulog, &
1172	ONLY: cpu_log, &
1173	cpu_log_nowait, &
1174	log_point_s
1175	#endif
1176
1177	USE indices, &
1178	ONLY: ny, &
1179	nz
1180
1181	USE kinds
1182
1183	USE pegrid
1184
1185	USE transpose_indices, &
1186	ONLY: nxl_y, &
1187	nxl_z, &
1188	nxr_y, &
1189	nxr_z, &
1190	nyn_z, &
1191	nys_z, &
1192	nzb_y, &
1193	nzt_y
1194
1195	IMPLICIT NONE
1196
1197	INTEGER(iwp) :: i !<
1198	INTEGER(iwp) :: j !<
1199	INTEGER(iwp) :: k !<
1200
1201	#if defined( __parallel )
1202	INTEGER(iwp) :: l !<
1203	INTEGER(iwp) :: zs !<
1204	#endif
1205
1206	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1207	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1208
1209	#if defined( __parallel )
1210	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1211	#if __acc_fft_device
1212	!$ACC DECLARE CREATE(work)
1213	#endif
1214	#endif
1215
1216	!
1217	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
1218	!-- therefore no transposition has to be done.
1219	IF ( pdims(1) /= 1 ) THEN
1220
1221	#if defined( __parallel )
1222	!
1223	!-- Reorder input array for transposition
1224	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1225	DO l = 0, pdims(1) - 1
1226	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1227	#if __acc_fft_device
1228	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1229	!$ACC PRESENT(work, f_in)
1230	#endif
1231	!$OMP DO
1232	DO j = nys_z, nyn_z
1233	DO k = zs, zs + nzt_y - nzb_y
1234	DO i = nxl_z, nxr_z
1235	work(i,k-zs+1,j,l) = f_in(i,j,k)
1236	ENDDO
1237	ENDDO
1238	ENDDO
1239	!$OMP END DO NOWAIT
1240	ENDDO
1241	!$OMP END PARALLEL
1242
1243	!
1244	!-- Transpose array
1245	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1246
1247	#if __acc_fft_device
1248	#ifndef __cuda_aware_mpi
1249	!$ACC UPDATE HOST(work)
1250	#else
1251	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1252	#endif
1253	#endif
1254
1255	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1256	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1257	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
1258
1259	#if __acc_fft_device
1260	#ifndef __cuda_aware_mpi
1261	!$ACC UPDATE DEVICE(f_inv)
1262	#else
1263	!$ACC END HOST_DATA
1264	#endif
1265	#endif
1266
1267	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1268	#endif
1269
1270	ELSE
1271	!
1272	!-- Reorder the array in the same way like ALLTOALL did it
1273	!$OMP PARALLEL PRIVATE ( i, j, k )
1274	!$OMP DO
1275	#if __acc_fft_device
1276	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1277	!$ACC PRESENT(f_inv, f_in)
1278	#endif
1279	DO k = nzb_y, nzt_y
1280	DO j = 0, ny
1281	DO i = nxl_y, nxr_y
1282	f_inv(i,k,j) = f_in(i,j,k)
1283	ENDDO
1284	ENDDO
1285	ENDDO
1286	!$OMP END PARALLEL
1287
1288	ENDIF
1289
1290	END SUBROUTINE transpose_zy
1291
1292
1293	!--------------------------------------------------------------------------------------------------!
1294	! Description:
1295	! ------------
1296	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
1297	!> on the same PE, while after transposition, all elements along y reside on the same PE. This is a
1298	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy).
1299	!--------------------------------------------------------------------------------------------------!
1300	#if defined( __parallel )
1301	SUBROUTINE transpose_zyd( f_in, f_out )
1302
1303
1304	USE cpulog, &
1305	ONLY: cpu_log, &
1306	log_point_s
1307
1308	USE indices, &
1309	ONLY: nnx, &
1310	nny, &
1311	nnz, &
1312	nxl, &
1313	nxr, &
1314	nyn, &
1315	nys, &
1316	ny, &
1317	nz
1318
1319	USE kinds
1320
1321	USE pegrid
1322
1323	USE transpose_indices, &
1324	ONLY: nxl_yd, &
1325	nxr_yd, &
1326	nzb_yd, &
1327	nzt_yd
1328
1329	IMPLICIT NONE
1330
1331	INTEGER(iwp) :: i !<
1332	INTEGER(iwp) :: j !<
1333	INTEGER(iwp) :: k !<
1334	INTEGER(iwp) :: l !<
1335	INTEGER(iwp) :: m !<
1336	INTEGER(iwp) :: ys !<
1337
1338	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1339	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1340	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1341	REAL(wp) :: work(nnxnnynnz) !<
1342
1343	!
1344	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
1345	DO i = nxl, nxr
1346	DO j = nys, nyn
1347	DO k = 1, nz
1348	f_inv(j,i,k) = f_in(k,j,i)
1349	ENDDO
1350	ENDDO
1351	ENDDO
1352
1353	!
1354	!-- Move data to different array, because memory location of work1 is needed further below
1355	!-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is
1356	!-- necessary and no transposition has to be done.
1357	IF ( pdims(2) == 1 ) THEN
1358	DO k = 1, nz
1359	DO i = nxl, nxr
1360	DO j = nys, nyn
1361	f_out(j,i,k) = f_inv(j,i,k)
1362	ENDDO
1363	ENDDO
1364	ENDDO
1365	RETURN
1366	ENDIF
1367
1368	!
1369	!-- Transpose array
1370	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1371	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1372	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1373	work(1), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr )
1374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1375
1376	!
1377	!-- Reorder transposed array
1378	m = 0
1379	DO l = 0, pdims(2) - 1
1380	ys = 0 + l * nny
1381	DO k = nzb_yd, nzt_yd
1382	DO i = nxl_yd, nxr_yd
1383	DO j = ys, ys + nny - 1
1384	m = m + 1
1385	f_out(j,i,k) = work(m)
1386	ENDDO
1387	ENDDO
1388	ENDDO
1389	ENDDO
1390
1391	END SUBROUTINE transpose_zyd
1392	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |