Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4698

Last change on this file since 4698 was 4540, checked in by raasch, 5 years ago
files re-formatted to follow the PALM coding standard
Property svn:keywords set to `Id`
File size: 46.1 KB

Line
1	!> @file transpose.f90
2	!--------------------------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
7	! (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
11	! Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
14	! <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2020 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------------------------!
18	!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4540 2020-05-18 15:23:29Z maronga $
27	! File re-formatted to follow the PALM coding standard
28	!
29	!
30	! 4429 2020-02-27 15:24:30Z raasch
31	! Bugfix: cpp-directives added for serial mode
32	!
33	! 4415 2020-02-20 10:30:33Z raasch
34	! Bugfix for misplaced preprocessor directive
35	!
36	! 4370 2020-01-10 14:00:44Z raasch
37	! Vector array renamed
38	!
39	! 4366 2020-01-09 08:12:43Z raasch
40	! Modifications for NEC vectorization
41	!
42	! 4360 2020-01-07 11:25:50Z suehring
43	! Added missing OpenMP directives
44	!
45	! 4182 2019-08-22 15:20:23Z scharf
46	! Corrected "Former revisions" section
47	!
48	! 4171 2019-08-19 17:44:09Z gronemeier
49	! Loop reordering for performance optimization
50	!
51	! 3832 2019-03-28 13:16:58Z raasch
52	! Loop reordering for performance optimization
53	!
54	! 3694 2019-01-23 17:01:49Z knoop
55	! OpenACC port for SPEC
56	!
57	! Revision 1.1 1997/07/24 11:25:18 raasch
58	! Initial revision
59	!
60	!
61	! Description:
62	! ------------
63	!> Resorting data for the transposition from x to y. The transposition itself is carried out in
64	!> transpose_xy.
65	!--------------------------------------------------------------------------------------------------!
66
67	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
68
69	SUBROUTINE resort_for_xy( f_in, f_inv )
70
71
72	USE indices, &
73	ONLY: nx
74
75	USE kinds
76
77	USE transpose_indices, &
78	ONLY: nyn_x, &
79	nys_x, &
80	nzb_x, &
81	nzt_x
82
83	IMPLICIT NONE
84
85	INTEGER(iwp) :: i !<
86	INTEGER(iwp) :: j !<
87	INTEGER(iwp) :: k !<
88
89	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
90	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
91
92	!
93	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous
94	!$OMP PARALLEL PRIVATE ( i, j, k )
95	!$OMP DO
96	#if __acc_fft_device
97	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
98	!$ACC PRESENT(f_inv, f_in)
99	#endif
100	DO k = nzb_x, nzt_x
101	DO j = nys_x, nyn_x
102	DO i = 0, nx
103	f_inv(j,k,i) = f_in(i,j,k)
104	ENDDO
105	ENDDO
106	ENDDO
107	!$OMP END PARALLEL
108
109	END SUBROUTINE resort_for_xy
110
111
112	!--------------------------------------------------------------------------------------------------!
113	! Description:
114	! ------------
115	!> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside
116	!> on the same PE, while after transposition, all elements along y reside on the same PE.
117	!--------------------------------------------------------------------------------------------------!
118	SUBROUTINE transpose_xy( f_inv, f_out )
119
120
121	#if defined( __parallel )
122	USE cpulog, &
123	ONLY: cpu_log, &
124	cpu_log_nowait, &
125	log_point_s
126	#endif
127
128	USE indices, &
129	ONLY: nx, &
130	ny
131
132	USE kinds
133
134	USE pegrid
135
136	USE transpose_indices, &
137	ONLY: nxl_y, &
138	nxr_y, &
139	nyn_x, &
140	nys_x, &
141	nzb_x, &
142	nzb_y, &
143	nzt_x, &
144	nzt_y
145
146	IMPLICIT NONE
147
148	INTEGER(iwp) :: i !<
149	INTEGER(iwp) :: j !<
150	INTEGER(iwp) :: k !<
151
152	#if defined( __parallel )
153	INTEGER(iwp) :: l !<
154	INTEGER(iwp) :: ys !<
155	#endif
156
157	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
158	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
159
160	#if defined( __parallel )
161	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
162	#if __acc_fft_device
163	!$ACC DECLARE CREATE(work)
164	#endif
165	#endif
166
167
168	IF ( numprocs /= 1 ) THEN
169
170	#if defined( __parallel )
171	!
172	!-- Transpose array
173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
174
175	#if __acc_fft_device
176	#ifndef __cuda_aware_mpi
177	!$ACC UPDATE HOST(f_inv)
178	#else
179	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
180	#endif
181	#endif
182
183	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
184	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
185	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
186
187	#if __acc_fft_device
188	#ifndef __cuda_aware_mpi
189	!$ACC UPDATE DEVICE(work)
190	#else
191	!$ACC END HOST_DATA
192	#endif
193	#endif
194
195	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
196
197	!
198	!-- Reorder transposed array
199	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
200	DO l = 0, pdims(2) - 1
201	ys = 0 + l * ( nyn_x - nys_x + 1 )
202	#if __acc_fft_device
203	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
204	!$ACC PRESENT(f_out, work)
205	#endif
206	!$OMP DO
207	DO i = nxl_y, nxr_y
208	DO k = nzb_y, nzt_y
209	DO j = ys, ys + nyn_x - nys_x
210	f_out(j,i,k) = work(j-ys+1,k,i,l)
211	ENDDO
212	ENDDO
213	ENDDO
214	!$OMP END DO NOWAIT
215	ENDDO
216	!$OMP END PARALLEL
217	#endif
218
219	ELSE
220
221	!
222	!-- Reorder transposed array
223	!$OMP PARALLEL PRIVATE ( i, j, k )
224	!$OMP DO
225	#if __acc_fft_device
226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
227	!$ACC PRESENT(f_out, f_inv)
228	#endif
229	DO k = nzb_y, nzt_y
230	DO i = nxl_y, nxr_y
231	DO j = 0, ny
232	f_out(j,i,k) = f_inv(j,k,i)
233	ENDDO
234	ENDDO
235	ENDDO
236	!$OMP END PARALLEL
237
238	ENDIF
239
240	END SUBROUTINE transpose_xy
241
242
243	!--------------------------------------------------------------------------------------------------!
244	! Description:
245	! ------------
246	!> Resorting data after the transposition from x to z. The transposition itself is carried out in
247	!> transpose_xz.
248	!--------------------------------------------------------------------------------------------------!
249	SUBROUTINE resort_for_xz( f_inv, f_out )
250
251
252	USE indices, &
253	ONLY: nxl, &
254	nxr, &
255	nyn, &
256	nys, &
257	nz
258
259	USE kinds
260
261	IMPLICIT NONE
262
263	INTEGER(iwp) :: i !<
264	INTEGER(iwp) :: j !<
265	INTEGER(iwp) :: k !<
266
267	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
268	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
269
270	!
271	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
272	!-- In case of parallel fft/transposition, scattered store is faster in backward direction!!!
273	!$OMP PARALLEL PRIVATE ( i, j, k )
274	!$OMP DO
275	#if __acc_fft_device
276	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
277	!$ACC PRESENT(f_out, f_inv)
278	#endif
279	DO i = nxl, nxr
280	DO j = nys, nyn
281	DO k = 1, nz
282	f_out(k,j,i) = f_inv(j,i,k)
283	ENDDO
284	ENDDO
285	ENDDO
286	!$OMP END PARALLEL
287
288	END SUBROUTINE resort_for_xz
289
290
291	!--------------------------------------------------------------------------------------------------!
292	! Description:
293	! ------------
294	!> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside
295	!> on the same PE, while after transposition, all elements along z reside on the same PE.
296	!--------------------------------------------------------------------------------------------------!
297	SUBROUTINE transpose_xz( f_in, f_inv )
298
299	#if defined( __parallel )
300	USE cpulog, &
301	ONLY: cpu_log, &
302	cpu_log_nowait, &
303	log_point_s
304
305	USE fft_xy, &
306	ONLY: f_vec_x, &
307	temperton_fft_vec
308	#endif
309
310	USE indices, &
311	ONLY: nx, &
312	nxl, &
313	nxr, &
314	nyn, &
315	nys, &
316	nz
317
318	#if defined( __parallel )
319	USE indices, &
320	ONLY: nnx
321	#endif
322
323	USE kinds
324
325	USE pegrid
326
327	USE transpose_indices, &
328	ONLY: nyn_x, &
329	nys_x, &
330	nzb_x, &
331	nzt_x
332
333	IMPLICIT NONE
334
335	INTEGER(iwp) :: i !<
336	INTEGER(iwp) :: j !<
337	INTEGER(iwp) :: k !<
338
339	#if defined( __parallel )
340	INTEGER(iwp) :: l !<
341	INTEGER(iwp) :: mm !<
342	INTEGER(iwp) :: xs !<
343	#endif
344
345	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
346	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
347
348	#if defined( __parallel )
349	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
350	#if __acc_fft_device
351	!$ACC DECLARE CREATE(work)
352	#endif
353	#endif
354
355	!
356	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
357	!-- therefore no transposition has to be done.
358	IF ( pdims(1) /= 1 ) THEN
359
360	#if defined( __parallel )
361	!
362	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
363	!-- different array format (f_vec_x).
364	IF ( temperton_fft_vec ) THEN
365
366	DO l = 0, pdims(1) - 1
367	xs = 0 + l * nnx
368	DO k = nzb_x, nzt_x
369	DO i = xs, xs + nnx - 1
370	DO j = nys_x, nyn_x
371	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
372	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
373	ENDDO
374	ENDDO
375	ENDDO
376	ENDDO
377
378	ELSE
379
380	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
381	DO l = 0, pdims(1) - 1
382	xs = 0 + l * nnx
383	#if __acc_fft_device
384	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
385	!$ACC PRESENT(work, f_in)
386	#endif
387	!$OMP DO
388	DO k = nzb_x, nzt_x
389	DO i = xs, xs + nnx - 1
390	DO j = nys_x, nyn_x
391	work(j,i-xs+1,k,l) = f_in(i,j,k)
392	ENDDO
393	ENDDO
394	ENDDO
395	!$OMP END DO NOWAIT
396	ENDDO
397	!$OMP END PARALLEL
398
399	ENDIF
400
401	!
402	!-- Transpose array
403	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
404
405	#if __acc_fft_device
406	#ifndef __cuda_aware_mpi
407	!$ACC UPDATE HOST(work)
408	#else
409	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
410	#endif
411	#endif
412
413	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
414	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
415	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
416
417	#if __acc_fft_device
418	#ifndef __cuda_aware_mpi
419	!$ACC UPDATE DEVICE(f_inv)
420	#else
421	!$ACC END HOST_DATA
422	#endif
423	#endif
424
425	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
426	#endif
427
428	ELSE
429
430	!
431	!-- Reorder the array in a way that the z index is in first position
432	!$OMP PARALLEL PRIVATE ( i, j, k )
433	!$OMP DO
434	#if __acc_fft_device
435	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
436	!$ACC PRESENT(f_inv, f_in)
437	#endif
438	DO i = nxl, nxr
439	DO j = nys, nyn
440	DO k = 1, nz
441	f_inv(j,i,k) = f_in(i,j,k)
442	ENDDO
443	ENDDO
444	ENDDO
445	!$OMP END PARALLEL
446
447	ENDIF
448
449	END SUBROUTINE transpose_xz
450
451
452	!--------------------------------------------------------------------------------------------------!
453	! Description:
454	! ------------
455	!> Resorting data after the transposition from y to x. The transposition itself is carried out in
456	!> transpose_yx.
457	!--------------------------------------------------------------------------------------------------!
458	SUBROUTINE resort_for_yx( f_inv, f_out )
459
460
461	USE indices, &
462	ONLY: nx
463
464	USE kinds
465
466	USE transpose_indices, &
467	ONLY: nyn_x, &
468	nys_x, &
469	nzb_x, &
470	nzt_x
471
472	IMPLICIT NONE
473
474	INTEGER(iwp) :: i !<
475	INTEGER(iwp) :: j !<
476	INTEGER(iwp) :: k !<
477
478	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
479	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
480
481	!
482	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
483	!$OMP PARALLEL PRIVATE ( i, j, k )
484	!$OMP DO
485	#if __acc_fft_device
486	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
487	!$ACC PRESENT(f_out, f_inv)
488	#endif
489	DO k = nzb_x, nzt_x
490	DO j = nys_x, nyn_x
491	DO i = 0, nx
492	f_out(i,j,k) = f_inv(j,k,i)
493	ENDDO
494	ENDDO
495	ENDDO
496	!$OMP END PARALLEL
497
498	END SUBROUTINE resort_for_yx
499
500
501	!--------------------------------------------------------------------------------------------------!
502	! Description:
503	! ------------
504	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y
505	!> reside on the same PE, while after transposition, all elements along x reside on the same PE.
506	!--------------------------------------------------------------------------------------------------!
507	SUBROUTINE transpose_yx( f_in, f_inv )
508
509
510	#if defined( __parallel )
511	USE cpulog, &
512	ONLY: cpu_log, &
513	cpu_log_nowait, &
514	log_point_s
515	#endif
516
517	USE indices, &
518	ONLY: nx, &
519	ny
520
521	USE kinds
522
523	USE pegrid
524
525	USE transpose_indices, &
526	ONLY: nxl_y, &
527	nxr_y, &
528	nyn_x, &
529	nys_x, &
530	nzb_x, &
531	nzb_y, &
532	nzt_x, &
533	nzt_y
534
535	IMPLICIT NONE
536
537	INTEGER(iwp) :: i !<
538	INTEGER(iwp) :: j !<
539	INTEGER(iwp) :: k !<
540
541	#if defined( __parallel )
542	INTEGER(iwp) :: l !<
543	INTEGER(iwp) :: ys !<
544	#endif
545
546	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
547	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
548
549	#if defined( __parallel )
550	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
551	#if __acc_fft_device
552	!$ACC DECLARE CREATE(work)
553	#endif
554	#endif
555
556
557	IF ( numprocs /= 1 ) THEN
558
559	#if defined( __parallel )
560	!
561	!-- Reorder input array for transposition
562	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
563	DO l = 0, pdims(2) - 1
564	ys = 0 + l * ( nyn_x - nys_x + 1 )
565	#if __acc_fft_device
566	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
567	!$ACC PRESENT(work, f_in)
568	#endif
569	!$OMP DO
570	DO i = nxl_y, nxr_y
571	DO k = nzb_y, nzt_y
572	DO j = ys, ys + nyn_x - nys_x
573	work(j-ys+1,k,i,l) = f_in(j,i,k)
574	ENDDO
575	ENDDO
576	ENDDO
577	!$OMP END DO NOWAIT
578	ENDDO
579	!$OMP END PARALLEL
580
581	!
582	!-- Transpose array
583	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
584
585	#if __acc_fft_device
586	#ifndef __cuda_aware_mpi
587	!$ACC UPDATE HOST(work)
588	#else
589	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
590	#endif
591	#endif
592
593	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
594	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
595	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
596
597	#if __acc_fft_device
598	#ifndef __cuda_aware_mpi
599	!$ACC UPDATE DEVICE(f_inv)
600	#else
601	!$ACC END HOST_DATA
602	#endif
603	#endif
604
605	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
606	#endif
607
608	ELSE
609
610	!
611	!-- Reorder array f_in the same way as ALLTOALL did it.
612	!$OMP PARALLEL PRIVATE ( i, j, k )
613	!$OMP DO
614	#if __acc_fft_device
615	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
616	!$ACC PRESENT(f_inv, f_in)
617	#endif
618	DO i = nxl_y, nxr_y
619	DO k = nzb_y, nzt_y
620	DO j = 0, ny
621	f_inv(j,k,i) = f_in(j,i,k)
622	ENDDO
623	ENDDO
624	ENDDO
625	!$OMP END PARALLEL
626
627	ENDIF
628
629	END SUBROUTINE transpose_yx
630
631
632	!--------------------------------------------------------------------------------------------------!
633	! Description:
634	! ------------
635	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside
636	!> on the same PE, while after transposition, all elements along x reside on the same PE. This is a
637	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx).
638	!--------------------------------------------------------------------------------------------------!
639	#if defined( __parallel )
640	SUBROUTINE transpose_yxd( f_in, f_out )
641
642
643	USE cpulog, &
644	ONLY: cpu_log, &
645	log_point_s
646
647	USE indices, &
648	ONLY: nnx, &
649	nny, &
650	nnz, &
651	nx, &
652	nxl, &
653	nxr, &
654	nyn, &
655	nys, &
656	nz
657
658	USE kinds
659
660	USE pegrid
661
662	USE transpose_indices, &
663	ONLY: nyn_x, &
664	nys_x, &
665	nzb_x, &
666	nzt_x
667
668	IMPLICIT NONE
669
670	INTEGER(iwp) :: i !<
671	INTEGER(iwp) :: j !<
672	INTEGER(iwp) :: k !<
673	INTEGER(iwp) :: l !<
674	INTEGER(iwp) :: m !<
675	INTEGER(iwp) :: xs !<
676
677	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
678	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
679	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
680	REAL(wp) :: work(nnxnnynnz) !<
681
682	!
683	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
684	DO k = 1, nz
685	DO j = nys, nyn
686	DO i = nxl, nxr
687	f_inv(i,k,j) = f_in(k,j,i)
688	ENDDO
689	ENDDO
690	ENDDO
691
692	!
693	!-- Transpose array
694	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
695	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
696	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
697	work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
698	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
699
700	!
701	!-- Reorder transposed array
702	m = 0
703	DO l = 0, pdims(1) - 1
704	xs = 0 + l * nnx
705	DO j = nys_x, nyn_x
706	DO k = 1, nz
707	DO i = xs, xs + nnx - 1
708	m = m + 1
709	f_out(i,j,k) = work(m)
710	ENDDO
711	ENDDO
712	ENDDO
713	ENDDO
714
715	END SUBROUTINE transpose_yxd
716	#endif
717
718
719	!--------------------------------------------------------------------------------------------------!
720	! Description:
721	! ------------
722	!> Resorting data for the transposition from y to z. The transposition itself is carried out in
723	!> transpose_yz.
724	!--------------------------------------------------------------------------------------------------!
725	SUBROUTINE resort_for_yz( f_in, f_inv )
726
727
728	USE indices, &
729	ONLY: ny
730
731	USE kinds
732
733	USE transpose_indices, &
734	ONLY: nxl_y, &
735	nxr_y, &
736	nzb_y, &
737	nzt_y
738
739	IMPLICIT NONE
740
741	INTEGER(iwp) :: i !<
742	INTEGER(iwp) :: j !<
743	INTEGER(iwp) :: k !<
744
745	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
746	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
747
748	!
749	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
750	!$OMP PARALLEL PRIVATE ( i, j, k )
751	!$OMP DO
752	#if __acc_fft_device
753	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
754	!$ACC PRESENT(f_inv, f_in)
755	#endif
756	DO k = nzb_y, nzt_y
757	DO i = nxl_y, nxr_y
758	DO j = 0, ny
759	f_inv(i,k,j) = f_in(j,i,k)
760	ENDDO
761	ENDDO
762	ENDDO
763	!$OMP END PARALLEL
764
765	END SUBROUTINE resort_for_yz
766
767
768	!--------------------------------------------------------------------------------------------------!
769	! Description:
770	! ------------
771	!> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside
772	!> on the same PE, while after transposition, all elements along z reside on the same PE.
773	!--------------------------------------------------------------------------------------------------!
774	SUBROUTINE transpose_yz( f_inv, f_out )
775
776
777	#if defined( __parallel )
778	USE cpulog, &
779	ONLY: cpu_log, &
780	cpu_log_nowait, &
781	log_point_s
782	#endif
783
784	USE indices, &
785	ONLY: ny, &
786	nz
787
788	USE kinds
789
790	USE pegrid
791
792	USE transpose_indices, &
793	ONLY: nxl_y, &
794	nxl_z, &
795	nxr_y, &
796	nxr_z, &
797	nyn_z, &
798	nys_z, &
799	nzb_y, &
800	nzt_y
801
802	IMPLICIT NONE
803
804	INTEGER(iwp) :: i !<
805	INTEGER(iwp) :: j !<
806	INTEGER(iwp) :: k !<
807
808	#if defined( __parallel )
809	INTEGER(iwp) :: l !<
810	INTEGER(iwp) :: zs !<
811	#endif
812
813	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
814	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
815
816	#if defined( __parallel )
817	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
818	#if __acc_fft_device
819	!$ACC DECLARE CREATE(work)
820	#endif
821	#endif
822
823
824	!
825	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
826	!-- transposition has to be done.
827	IF ( pdims(1) == 1 ) THEN
828
829	!$OMP PARALLEL PRIVATE ( i, j, k )
830	!$OMP DO
831	#if __acc_fft_device
832	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
833	!$ACC PRESENT(f_out, f_inv)
834	#endif
835	DO j = 0, ny
836	DO k = nzb_y, nzt_y
837	DO i = nxl_y, nxr_y
838	f_out(i,j,k) = f_inv(i,k,j)
839	ENDDO
840	ENDDO
841	ENDDO
842	!$OMP END PARALLEL
843
844	ELSE
845
846	#if defined( __parallel )
847	!
848	!-- Transpose array
849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
850
851	#if __acc_fft_device
852	#ifndef __cuda_aware_mpi
853	!$ACC UPDATE HOST(f_inv)
854	#else
855	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
856	#endif
857	#endif
858
859	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
860	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
861	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
862
863	#if __acc_fft_device
864	#ifndef __cuda_aware_mpi
865	!$ACC UPDATE DEVICE(work)
866	#else
867	!$ACC END HOST_DATA
868	#endif
869	#endif
870
871	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
872
873	!
874	!-- Reorder transposed array
875	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
876	DO l = 0, pdims(1) - 1
877	zs = 1 + l * ( nzt_y - nzb_y + 1 )
878	#if __acc_fft_device
879	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
880	!$ACC PRESENT(f_out, work)
881	#endif
882	!$OMP DO
883	DO j = nys_z, nyn_z
884	DO k = zs, zs + nzt_y - nzb_y
885	DO i = nxl_z, nxr_z
886	f_out(i,j,k) = work(i,k-zs+1,j,l)
887	ENDDO
888	ENDDO
889	ENDDO
890	!$OMP END DO NOWAIT
891	ENDDO
892	!$OMP END PARALLEL
893	#endif
894
895	ENDIF
896
897	END SUBROUTINE transpose_yz
898
899
900	!--------------------------------------------------------------------------------------------------!
901	! Description:
902	! ------------
903	!> Resorting data for the transposition from z to x. The transposition itself is carried out in
904	!> transpose_zx.
905	!--------------------------------------------------------------------------------------------------!
906	SUBROUTINE resort_for_zx( f_in, f_inv )
907
908
909	USE indices, &
910	ONLY: nxl, &
911	nxr, &
912	nyn, &
913	nys, &
914	nz
915
916	USE kinds
917
918	IMPLICIT NONE
919
920	INTEGER(iwp) :: i !<
921	INTEGER(iwp) :: j !<
922	INTEGER(iwp) :: k !<
923
924	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
925	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
926
927	!
928	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
929	!$OMP PARALLEL PRIVATE ( i, j, k )
930	!$OMP DO
931	#if __acc_fft_device
932	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
933	!$ACC PRESENT(f_in, f_inv)
934	#endif
935	DO i = nxl, nxr
936	DO j = nys, nyn
937	DO k = 1,nz
938	f_inv(j,i,k) = f_in(k,j,i)
939	ENDDO
940	ENDDO
941	ENDDO
942	!$OMP END PARALLEL
943
944	END SUBROUTINE resort_for_zx
945
946
947	!--------------------------------------------------------------------------------------------------!
948	! Description:
949	! ------------
950	!> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside
951	!> on the same PE, while after transposition, all elements along x reside on the same PE.
952	!--------------------------------------------------------------------------------------------------!
953	SUBROUTINE transpose_zx( f_inv, f_out )
954
955
956	#if defined( __parallel )
957	USE cpulog, &
958	ONLY: cpu_log, &
959	cpu_log_nowait, &
960	log_point_s
961
962	USE fft_xy, &
963	ONLY: f_vec_x, &
964	temperton_fft_vec
965	#endif
966
967	USE indices, &
968	ONLY: nx, &
969	nxl, &
970	nxr, &
971	nyn, &
972	nys, &
973	nz
974
975	#if defined( __parallel )
976	USE indices, &
977	ONLY: nnx
978	#endif
979
980	USE kinds
981
982	USE pegrid
983
984	USE transpose_indices, &
985	ONLY: nyn_x, &
986	nys_x, &
987	nzb_x, &
988	nzt_x
989
990	IMPLICIT NONE
991
992	INTEGER(iwp) :: i !<
993	INTEGER(iwp) :: j !<
994	INTEGER(iwp) :: k !<
995
996	#if defined( __parallel )
997	INTEGER(iwp) :: l !<
998	INTEGER(iwp) :: mm !<
999	INTEGER(iwp) :: xs !<
1000	#endif
1001
1002	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1003	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
1004
1005	#if defined( __parallel )
1006	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
1007	#if __acc_fft_device
1008	!$ACC DECLARE CREATE(work)
1009	#endif
1010	#endif
1011
1012
1013	!
1014	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
1015	!-- transposition has to be done.
1016	IF ( pdims(1) == 1 ) THEN
1017
1018	!$OMP PARALLEL PRIVATE ( i, j, k )
1019	!$OMP DO
1020	#if __acc_fft_device
1021	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1022	!$ACC PRESENT(f_out, f_inv)
1023	#endif
1024	DO k = 1, nz
1025	DO i = nxl, nxr
1026	DO j = nys, nyn
1027	f_out(i,j,k) = f_inv(j,i,k)
1028	ENDDO
1029	ENDDO
1030	ENDDO
1031	!$OMP END PARALLEL
1032
1033	ELSE
1034
1035	#if defined( __parallel )
1036	!
1037	!-- Transpose array
1038	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1039
1040	#if __acc_fft_device
1041	#ifndef __cuda_aware_mpi
1042	!$ACC UPDATE HOST(f_inv)
1043	#else
1044	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1045	#endif
1046	#endif
1047
1048	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1049	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
1050	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
1051
1052	#if __acc_fft_device
1053	#ifndef __cuda_aware_mpi
1054	!$ACC UPDATE DEVICE(work)
1055	#else
1056	!$ACC END HOST_DATA
1057	#endif
1058	#endif
1059
1060	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1061
1062	!
1063	!-- Reorder transposed array.
1064	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
1065	!-- saves additional data copy in fft_x.
1066	IF ( temperton_fft_vec ) THEN
1067
1068	DO l = 0, pdims(1) - 1
1069	xs = 0 + l * nnx
1070	DO k = nzb_x, nzt_x
1071	DO i = xs, xs + nnx - 1
1072	DO j = nys_x, nyn_x
1073	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
1074	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
1075	ENDDO
1076	ENDDO
1077	ENDDO
1078	ENDDO
1079
1080	ELSE
1081
1082	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
1083	DO l = 0, pdims(1) - 1
1084	xs = 0 + l * nnx
1085	#if __acc_fft_device
1086	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1087	!$ACC PRESENT(f_out, work)
1088	#endif
1089	!$OMP DO
1090	DO k = nzb_x, nzt_x
1091	DO i = xs, xs + nnx - 1
1092	DO j = nys_x, nyn_x
1093	f_out(i,j,k) = work(j,i-xs+1,k,l)
1094	ENDDO
1095	ENDDO
1096	ENDDO
1097	!$OMP END DO NOWAIT
1098	ENDDO
1099	!$OMP END PARALLEL
1100
1101	ENDIF
1102
1103	#endif
1104
1105	ENDIF
1106
1107	END SUBROUTINE transpose_zx
1108
1109
1110	!--------------------------------------------------------------------------------------------------!
1111	! Description:
1112	! ------------
1113	!> Resorting data after the transposition from z to y. The transposition itself is carried out in
1114	!> transpose_zy.
1115	!--------------------------------------------------------------------------------------------------!
1116	SUBROUTINE resort_for_zy( f_inv, f_out )
1117
1118
1119	USE indices, &
1120	ONLY: ny
1121
1122	USE kinds
1123
1124	USE transpose_indices, &
1125	ONLY: nxl_y, &
1126	nxr_y, &
1127	nzb_y, &
1128	nzt_y
1129
1130	IMPLICIT NONE
1131
1132	INTEGER(iwp) :: i !<
1133	INTEGER(iwp) :: j !<
1134	INTEGER(iwp) :: k !<
1135
1136	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1137	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1138
1139	!
1140	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
1141	!$OMP PARALLEL PRIVATE ( i, j, k )
1142	!$OMP DO
1143	#if __acc_fft_device
1144	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1145	!$ACC PRESENT(f_out, f_inv)
1146	#endif
1147	DO k = nzb_y, nzt_y
1148	DO i = nxl_y, nxr_y
1149	DO j = 0, ny
1150	f_out(j,i,k) = f_inv(i,k,j)
1151	ENDDO
1152	ENDDO
1153	ENDDO
1154	!$OMP END PARALLEL
1155
1156	END SUBROUTINE resort_for_zy
1157
1158
1159	!--------------------------------------------------------------------------------------------------!
1160	! Description:cpu_log_nowait
1161	! ------------
1162	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
1163	!> on the same PE, while after transposition, all elements along y reside on the same PE.
1164	!--------------------------------------------------------------------------------------------------!
1165	SUBROUTINE transpose_zy( f_in, f_inv )
1166
1167
1168	#if defined( __parallel )
1169	USE cpulog, &
1170	ONLY: cpu_log, &
1171	cpu_log_nowait, &
1172	log_point_s
1173	#endif
1174
1175	USE indices, &
1176	ONLY: ny, &
1177	nz
1178
1179	USE kinds
1180
1181	USE pegrid
1182
1183	USE transpose_indices, &
1184	ONLY: nxl_y, &
1185	nxl_z, &
1186	nxr_y, &
1187	nxr_z, &
1188	nyn_z, &
1189	nys_z, &
1190	nzb_y, &
1191	nzt_y
1192
1193	IMPLICIT NONE
1194
1195	INTEGER(iwp) :: i !<
1196	INTEGER(iwp) :: j !<
1197	INTEGER(iwp) :: k !<
1198
1199	#if defined( __parallel )
1200	INTEGER(iwp) :: l !<
1201	INTEGER(iwp) :: zs !<
1202	#endif
1203
1204	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1205	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1206
1207	#if defined( __parallel )
1208	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1209	#if __acc_fft_device
1210	!$ACC DECLARE CREATE(work)
1211	#endif
1212	#endif
1213
1214	!
1215	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
1216	!-- therefore no transposition has to be done.
1217	IF ( pdims(1) /= 1 ) THEN
1218
1219	#if defined( __parallel )
1220	!
1221	!-- Reorder input array for transposition
1222	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1223	DO l = 0, pdims(1) - 1
1224	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1225	#if __acc_fft_device
1226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1227	!$ACC PRESENT(work, f_in)
1228	#endif
1229	!$OMP DO
1230	DO j = nys_z, nyn_z
1231	DO k = zs, zs + nzt_y - nzb_y
1232	DO i = nxl_z, nxr_z
1233	work(i,k-zs+1,j,l) = f_in(i,j,k)
1234	ENDDO
1235	ENDDO
1236	ENDDO
1237	!$OMP END DO NOWAIT
1238	ENDDO
1239	!$OMP END PARALLEL
1240
1241	!
1242	!-- Transpose array
1243	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1244
1245	#if __acc_fft_device
1246	#ifndef __cuda_aware_mpi
1247	!$ACC UPDATE HOST(work)
1248	#else
1249	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1250	#endif
1251	#endif
1252
1253	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1254	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1255	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
1256
1257	#if __acc_fft_device
1258	#ifndef __cuda_aware_mpi
1259	!$ACC UPDATE DEVICE(f_inv)
1260	#else
1261	!$ACC END HOST_DATA
1262	#endif
1263	#endif
1264
1265	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1266	#endif
1267
1268	ELSE
1269	!
1270	!-- Reorder the array in the same way like ALLTOALL did it
1271	!$OMP PARALLEL PRIVATE ( i, j, k )
1272	!$OMP DO
1273	#if __acc_fft_device
1274	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1275	!$ACC PRESENT(f_inv, f_in)
1276	#endif
1277	DO k = nzb_y, nzt_y
1278	DO j = 0, ny
1279	DO i = nxl_y, nxr_y
1280	f_inv(i,k,j) = f_in(i,j,k)
1281	ENDDO
1282	ENDDO
1283	ENDDO
1284	!$OMP END PARALLEL
1285
1286	ENDIF
1287
1288	END SUBROUTINE transpose_zy
1289
1290
1291	!--------------------------------------------------------------------------------------------------!
1292	! Description:
1293	! ------------
1294	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
1295	!> on the same PE, while after transposition, all elements along y reside on the same PE. This is a
1296	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy).
1297	!--------------------------------------------------------------------------------------------------!
1298	#if defined( __parallel )
1299	SUBROUTINE transpose_zyd( f_in, f_out )
1300
1301
1302	USE cpulog, &
1303	ONLY: cpu_log, &
1304	log_point_s
1305
1306	USE indices, &
1307	ONLY: nnx, &
1308	nny, &
1309	nnz, &
1310	nxl, &
1311	nxr, &
1312	nyn, &
1313	nys, &
1314	ny, &
1315	nz
1316
1317	USE kinds
1318
1319	USE pegrid
1320
1321	USE transpose_indices, &
1322	ONLY: nxl_yd, &
1323	nxr_yd, &
1324	nzb_yd, &
1325	nzt_yd
1326
1327	IMPLICIT NONE
1328
1329	INTEGER(iwp) :: i !<
1330	INTEGER(iwp) :: j !<
1331	INTEGER(iwp) :: k !<
1332	INTEGER(iwp) :: l !<
1333	INTEGER(iwp) :: m !<
1334	INTEGER(iwp) :: ys !<
1335
1336	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1337	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1338	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1339	REAL(wp) :: work(nnxnnynnz) !<
1340
1341	!
1342	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
1343	DO i = nxl, nxr
1344	DO j = nys, nyn
1345	DO k = 1, nz
1346	f_inv(j,i,k) = f_in(k,j,i)
1347	ENDDO
1348	ENDDO
1349	ENDDO
1350
1351	!
1352	!-- Move data to different array, because memory location of work1 is needed further below
1353	!-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is
1354	!-- necessary and no transposition has to be done.
1355	IF ( pdims(2) == 1 ) THEN
1356	DO k = 1, nz
1357	DO i = nxl, nxr
1358	DO j = nys, nyn
1359	f_out(j,i,k) = f_inv(j,i,k)
1360	ENDDO
1361	ENDDO
1362	ENDDO
1363	RETURN
1364	ENDIF
1365
1366	!
1367	!-- Transpose array
1368	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1369	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1370	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1371	work(1), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr )
1372	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1373
1374	!
1375	!-- Reorder transposed array
1376	m = 0
1377	DO l = 0, pdims(2) - 1
1378	ys = 0 + l * nny
1379	DO k = nzb_yd, nzt_yd
1380	DO i = nxl_yd, nxr_yd
1381	DO j = ys, ys + nny - 1
1382	m = m + 1
1383	f_out(j,i,k) = work(m)
1384	ENDDO
1385	ENDDO
1386	ENDDO
1387	ENDDO
1388
1389	END SUBROUTINE transpose_zyd
1390	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |