Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4528

Last change on this file since 4528 was 4429, checked in by raasch, 5 years ago
serial (non-MPI) test case added, several bugfixes for the serial mode
Property svn:keywords set to `Id`
File size: 35.1 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4429 2020-02-27 15:24:30Z oliver.maas $
27	! bugfix: cpp-directives added for serial mode
28	!
29	! 4415 2020-02-20 10:30:33Z raasch
30	! bugfix for misplaced preprocessor directive
31	!
32	! 4370 2020-01-10 14:00:44Z raasch
33	! vector array renamed
34	!
35	! 4366 2020-01-09 08:12:43Z raasch
36	! modifications for NEC vectorization
37	!
38	! 4360 2020-01-07 11:25:50Z suehring
39	! Added missing OpenMP directives
40	!
41	! 4182 2019-08-22 15:20:23Z scharf
42	! Corrected "Former revisions" section
43	!
44	! 4171 2019-08-19 17:44:09Z gronemeier
45	! loop reordering for performance optimization
46	!
47	! 3832 2019-03-28 13:16:58Z raasch
48	! loop reordering for performance optimization
49	!
50	! 3694 2019-01-23 17:01:49Z knoop
51	! OpenACC port for SPEC
52	!
53	! Revision 1.1 1997/07/24 11:25:18 raasch
54	! Initial revision
55	!
56	!
57	! Description:
58	! ------------
59	!> Resorting data for the transposition from x to y. The transposition itself
60	!> is carried out in transpose_xy
61	!------------------------------------------------------------------------------!
62
63	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
64
65	SUBROUTINE resort_for_xy( f_in, f_inv )
66
67
68	USE indices, &
69	ONLY: nx
70
71	USE kinds
72
73	USE transpose_indices, &
74	ONLY: nyn_x, nys_x, nzb_x, nzt_x
75
76	IMPLICIT NONE
77
78	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
79	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
80
81
82	INTEGER(iwp) :: i !<
83	INTEGER(iwp) :: j !<
84	INTEGER(iwp) :: k !<
85	!
86	!-- Rearrange indices of input array in order to make data to be send
87	!-- by MPI contiguous
88	!$OMP PARALLEL PRIVATE ( i, j, k )
89	!$OMP DO
90	#if __acc_fft_device
91	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
92	!$ACC PRESENT(f_inv, f_in)
93	#endif
94	DO k = nzb_x, nzt_x
95	DO j = nys_x, nyn_x
96	DO i = 0, nx
97	f_inv(j,k,i) = f_in(i,j,k)
98	ENDDO
99	ENDDO
100	ENDDO
101	!$OMP END PARALLEL
102
103	END SUBROUTINE resort_for_xy
104
105
106	!------------------------------------------------------------------------------!
107	! Description:
108	! ------------
109	!> Transposition of input array (f_in) from x to y. For the input array, all
110	!> elements along x reside on the same PE, while after transposition, all
111	!> elements along y reside on the same PE.
112	!------------------------------------------------------------------------------!
113	SUBROUTINE transpose_xy( f_inv, f_out )
114
115
116	#if defined( __parallel )
117	USE cpulog, &
118	ONLY: cpu_log, cpu_log_nowait, log_point_s
119	#endif
120
121	USE indices, &
122	ONLY: nx, ny
123
124	USE kinds
125
126	USE pegrid
127
128	USE transpose_indices, &
129	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
130
131	IMPLICIT NONE
132
133	INTEGER(iwp) :: i !<
134	INTEGER(iwp) :: j !<
135	INTEGER(iwp) :: k !<
136
137	#if defined( __parallel )
138	INTEGER(iwp) :: l !<
139	INTEGER(iwp) :: ys !<
140	#endif
141
142	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
143	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
144
145	#if defined( __parallel )
146	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
147	#if __acc_fft_device
148	!$ACC DECLARE CREATE(work)
149	#endif
150	#endif
151
152
153	IF ( numprocs /= 1 ) THEN
154
155	#if defined( __parallel )
156	!
157	!-- Transpose array
158	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
159
160	#if __acc_fft_device
161	#ifndef __cuda_aware_mpi
162	!$ACC UPDATE HOST(f_inv)
163	#else
164	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
165	#endif
166	#endif
167
168	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
169	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
170	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
171	comm1dy, ierr )
172
173	#if __acc_fft_device
174	#ifndef __cuda_aware_mpi
175	!$ACC UPDATE DEVICE(work)
176	#else
177	!$ACC END HOST_DATA
178	#endif
179	#endif
180
181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
182
183	!
184	!-- Reorder transposed array
185	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
186	DO l = 0, pdims(2) - 1
187	ys = 0 + l * ( nyn_x - nys_x + 1 )
188	#if __acc_fft_device
189	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
190	!$ACC PRESENT(f_out, work)
191	#endif
192	!$OMP DO
193	DO i = nxl_y, nxr_y
194	DO k = nzb_y, nzt_y
195	DO j = ys, ys + nyn_x - nys_x
196	f_out(j,i,k) = work(j-ys+1,k,i,l)
197	ENDDO
198	ENDDO
199	ENDDO
200	!$OMP END DO NOWAIT
201	ENDDO
202	!$OMP END PARALLEL
203	#endif
204
205	ELSE
206
207	!
208	!-- Reorder transposed array
209	!$OMP PARALLEL PRIVATE ( i, j, k )
210	!$OMP DO
211	#if __acc_fft_device
212	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
213	!$ACC PRESENT(f_out, f_inv)
214	#endif
215	DO k = nzb_y, nzt_y
216	DO i = nxl_y, nxr_y
217	DO j = 0, ny
218	f_out(j,i,k) = f_inv(j,k,i)
219	ENDDO
220	ENDDO
221	ENDDO
222	!$OMP END PARALLEL
223
224	ENDIF
225
226	END SUBROUTINE transpose_xy
227
228
229	!------------------------------------------------------------------------------!
230	! Description:
231	! ------------
232	!> Resorting data after the transposition from x to z. The transposition itself
233	!> is carried out in transpose_xz
234	!------------------------------------------------------------------------------!
235	SUBROUTINE resort_for_xz( f_inv, f_out )
236
237
238	USE indices, &
239	ONLY: nxl, nxr, nyn, nys, nz
240
241	USE kinds
242
243	IMPLICIT NONE
244
245	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
246	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
247
248	INTEGER(iwp) :: i !<
249	INTEGER(iwp) :: j !<
250	INTEGER(iwp) :: k !<
251	!
252	!-- Rearrange indices of input array in order to make data to be send
253	!-- by MPI contiguous.
254	!-- In case of parallel fft/transposition, scattered store is faster in
255	!-- backward direction!!!
256	!$OMP PARALLEL PRIVATE ( i, j, k )
257	!$OMP DO
258	#if __acc_fft_device
259	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
260	!$ACC PRESENT(f_out, f_inv)
261	#endif
262	DO i = nxl, nxr
263	DO j = nys, nyn
264	DO k = 1, nz
265	f_out(k,j,i) = f_inv(j,i,k)
266	ENDDO
267	ENDDO
268	ENDDO
269	!$OMP END PARALLEL
270
271	END SUBROUTINE resort_for_xz
272
273
274	!------------------------------------------------------------------------------!
275	! Description:
276	! ------------
277	!> Transposition of input array (f_in) from x to z. For the input array, all
278	!> elements along x reside on the same PE, while after transposition, all
279	!> elements along z reside on the same PE.
280	!------------------------------------------------------------------------------!
281	SUBROUTINE transpose_xz( f_in, f_inv )
282
283	#if defined( __parallel )
284	USE cpulog, &
285	ONLY: cpu_log, cpu_log_nowait, log_point_s
286
287	USE fft_xy, &
288	ONLY: f_vec_x, temperton_fft_vec
289	#endif
290
291	USE indices, &
292	ONLY: nx, nxl, nxr, nyn, nys, nz
293	#if defined( __parallel )
294	USE indices, &
295	ONLY: nnx
296	#endif
297
298	USE kinds
299
300	USE pegrid
301
302	USE transpose_indices, &
303	ONLY: nyn_x, nys_x, nzb_x, nzt_x
304
305	IMPLICIT NONE
306
307	INTEGER(iwp) :: i !<
308	INTEGER(iwp) :: j !<
309	INTEGER(iwp) :: k !<
310	#if defined( __parallel )
311	INTEGER(iwp) :: l !<
312	INTEGER(iwp) :: mm !<
313	INTEGER(iwp) :: xs !<
314	#endif
315
316	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
317	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
318
319	#if defined( __parallel )
320	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
321	#if __acc_fft_device
322	!$ACC DECLARE CREATE(work)
323	#endif
324	#endif
325
326	!
327	!-- If the PE grid is one-dimensional along y, the array has only to be
328	!-- reordered locally and therefore no transposition has to be done.
329	IF ( pdims(1) /= 1 ) THEN
330
331	#if defined( __parallel )
332	!
333	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
334	!-- different array format (f_vec_x).
335	IF ( temperton_fft_vec ) THEN
336
337	DO l = 0, pdims(1) - 1
338	xs = 0 + l * nnx
339	DO k = nzb_x, nzt_x
340	DO i = xs, xs + nnx - 1
341	DO j = nys_x, nyn_x
342	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
343	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
344	ENDDO
345	ENDDO
346	ENDDO
347	ENDDO
348
349	ELSE
350
351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
352	DO l = 0, pdims(1) - 1
353	xs = 0 + l * nnx
354	#if __acc_fft_device
355	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
356	!$ACC PRESENT(work, f_in)
357	#endif
358	!$OMP DO
359	DO k = nzb_x, nzt_x
360	DO i = xs, xs + nnx - 1
361	DO j = nys_x, nyn_x
362	work(j,i-xs+1,k,l) = f_in(i,j,k)
363	ENDDO
364	ENDDO
365	ENDDO
366	!$OMP END DO NOWAIT
367	ENDDO
368	!$OMP END PARALLEL
369
370	ENDIF
371
372	!
373	!-- Transpose array
374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
375
376	#if __acc_fft_device
377	#ifndef __cuda_aware_mpi
378	!$ACC UPDATE HOST(work)
379	#else
380	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
381	#endif
382	#endif
383
384	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
385	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
386	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
387	comm1dx, ierr )
388
389	#if __acc_fft_device
390	#ifndef __cuda_aware_mpi
391	!$ACC UPDATE DEVICE(f_inv)
392	#else
393	!$ACC END HOST_DATA
394	#endif
395	#endif
396
397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
398	#endif
399
400	ELSE
401
402	!
403	!-- Reorder the array in a way that the z index is in first position
404	!$OMP PARALLEL PRIVATE ( i, j, k )
405	!$OMP DO
406	#if __acc_fft_device
407	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
408	!$ACC PRESENT(f_inv, f_in)
409	#endif
410	DO i = nxl, nxr
411	DO j = nys, nyn
412	DO k = 1, nz
413	f_inv(j,i,k) = f_in(i,j,k)
414	ENDDO
415	ENDDO
416	ENDDO
417	!$OMP END PARALLEL
418
419	ENDIF
420
421	END SUBROUTINE transpose_xz
422
423
424	!------------------------------------------------------------------------------!
425	! Description:
426	! ------------
427	!> Resorting data after the transposition from y to x. The transposition itself
428	!> is carried out in transpose_yx
429	!------------------------------------------------------------------------------!
430	SUBROUTINE resort_for_yx( f_inv, f_out )
431
432
433	USE indices, &
434	ONLY: nx
435
436	USE kinds
437
438	USE transpose_indices, &
439	ONLY: nyn_x, nys_x, nzb_x, nzt_x
440
441	IMPLICIT NONE
442
443	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
444	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
445
446
447	INTEGER(iwp) :: i !<
448	INTEGER(iwp) :: j !<
449	INTEGER(iwp) :: k !<
450	!
451	!-- Rearrange indices of input array in order to make data to be send
452	!-- by MPI contiguous
453	!$OMP PARALLEL PRIVATE ( i, j, k )
454	!$OMP DO
455	#if __acc_fft_device
456	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
457	!$ACC PRESENT(f_out, f_inv)
458	#endif
459	DO k = nzb_x, nzt_x
460	DO j = nys_x, nyn_x
461	DO i = 0, nx
462	f_out(i,j,k) = f_inv(j,k,i)
463	ENDDO
464	ENDDO
465	ENDDO
466	!$OMP END PARALLEL
467
468	END SUBROUTINE resort_for_yx
469
470
471	!------------------------------------------------------------------------------!
472	! Description:
473	! ------------
474	!> Transposition of input array (f_in) from y to x. For the input array, all
475	!> elements along y reside on the same PE, while after transposition, all
476	!> elements along x reside on the same PE.
477	!------------------------------------------------------------------------------!
478	SUBROUTINE transpose_yx( f_in, f_inv )
479
480
481	#if defined( __parallel )
482	USE cpulog, &
483	ONLY: cpu_log, cpu_log_nowait, log_point_s
484	#endif
485
486	USE indices, &
487	ONLY: nx, ny
488
489	USE kinds
490
491	USE pegrid
492
493	USE transpose_indices, &
494	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
495
496	IMPLICIT NONE
497
498	INTEGER(iwp) :: i !<
499	INTEGER(iwp) :: j !<
500	INTEGER(iwp) :: k !<
501	#if defined( __parallel )
502	INTEGER(iwp) :: l !<
503	INTEGER(iwp) :: ys !<
504	#endif
505
506	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
507	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
508
509	#if defined( __parallel )
510	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
511	#if __acc_fft_device
512	!$ACC DECLARE CREATE(work)
513	#endif
514	#endif
515
516
517	IF ( numprocs /= 1 ) THEN
518
519	#if defined( __parallel )
520	!
521	!-- Reorder input array for transposition
522	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
523	DO l = 0, pdims(2) - 1
524	ys = 0 + l * ( nyn_x - nys_x + 1 )
525	#if __acc_fft_device
526	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
527	!$ACC PRESENT(work, f_in)
528	#endif
529	!$OMP DO
530	DO i = nxl_y, nxr_y
531	DO k = nzb_y, nzt_y
532	DO j = ys, ys + nyn_x - nys_x
533	work(j-ys+1,k,i,l) = f_in(j,i,k)
534	ENDDO
535	ENDDO
536	ENDDO
537	!$OMP END DO NOWAIT
538	ENDDO
539	!$OMP END PARALLEL
540
541	!
542	!-- Transpose array
543	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
544
545	#if __acc_fft_device
546	#ifndef __cuda_aware_mpi
547	!$ACC UPDATE HOST(work)
548	#else
549	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
550	#endif
551	#endif
552
553	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
554	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
555	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
556	comm1dy, ierr )
557
558	#if __acc_fft_device
559	#ifndef __cuda_aware_mpi
560	!$ACC UPDATE DEVICE(f_inv)
561	#else
562	!$ACC END HOST_DATA
563	#endif
564	#endif
565
566	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
567	#endif
568
569	ELSE
570
571	!
572	!-- Reorder array f_in the same way as ALLTOALL did it
573	!$OMP PARALLEL PRIVATE ( i, j, k )
574	!$OMP DO
575	#if __acc_fft_device
576	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
577	!$ACC PRESENT(f_inv, f_in)
578	#endif
579	DO i = nxl_y, nxr_y
580	DO k = nzb_y, nzt_y
581	DO j = 0, ny
582	f_inv(j,k,i) = f_in(j,i,k)
583	ENDDO
584	ENDDO
585	ENDDO
586	!$OMP END PARALLEL
587
588	ENDIF
589
590	END SUBROUTINE transpose_yx
591
592
593	!------------------------------------------------------------------------------!
594	! Description:
595	! ------------
596	!> Transposition of input array (f_in) from y to x. For the input array, all
597	!> elements along y reside on the same PE, while after transposition, all
598	!> elements along x reside on the same PE.
599	!> This is a direct transposition for arrays with indices in regular order
600	!> (k,j,i) (cf. transpose_yx).
601	!------------------------------------------------------------------------------!
602	#if defined( __parallel )
603	SUBROUTINE transpose_yxd( f_in, f_out )
604
605
606	USE cpulog, &
607	ONLY: cpu_log, log_point_s
608
609	USE indices, &
610	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
611
612	USE kinds
613
614	USE pegrid
615
616	USE transpose_indices, &
617	ONLY: nyn_x, nys_x, nzb_x, nzt_x
618
619	IMPLICIT NONE
620
621	INTEGER(iwp) :: i !<
622	INTEGER(iwp) :: j !<
623	INTEGER(iwp) :: k !<
624	INTEGER(iwp) :: l !<
625	INTEGER(iwp) :: m !<
626	INTEGER(iwp) :: xs !<
627
628	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
629	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
630	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
631	REAL(wp) :: work(nnxnnynnz) !<
632
633	!
634	!-- Rearrange indices of input array in order to make data to be send
635	!-- by MPI contiguous
636	DO k = 1, nz
637	DO j = nys, nyn
638	DO i = nxl, nxr
639	f_inv(i,k,j) = f_in(k,j,i)
640	ENDDO
641	ENDDO
642	ENDDO
643
644	!
645	!-- Transpose array
646	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
647	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
648	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
649	work(1), sendrecvcount_xy, MPI_REAL, &
650	comm1dx, ierr )
651	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
652
653	!
654	!-- Reorder transposed array
655	m = 0
656	DO l = 0, pdims(1) - 1
657	xs = 0 + l * nnx
658	DO j = nys_x, nyn_x
659	DO k = 1, nz
660	DO i = xs, xs + nnx - 1
661	m = m + 1
662	f_out(i,j,k) = work(m)
663	ENDDO
664	ENDDO
665	ENDDO
666	ENDDO
667
668	END SUBROUTINE transpose_yxd
669	#endif
670
671
672	!------------------------------------------------------------------------------!
673	! Description:
674	! ------------
675	!> Resorting data for the transposition from y to z. The transposition itself
676	!> is carried out in transpose_yz
677	!------------------------------------------------------------------------------!
678	SUBROUTINE resort_for_yz( f_in, f_inv )
679
680
681	USE indices, &
682	ONLY: ny
683
684	USE kinds
685
686	USE transpose_indices, &
687	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
688
689	IMPLICIT NONE
690
691	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
692	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
693
694	INTEGER(iwp) :: i !<
695	INTEGER(iwp) :: j !<
696	INTEGER(iwp) :: k !<
697
698	!
699	!-- Rearrange indices of input array in order to make data to be send
700	!-- by MPI contiguous
701	!$OMP PARALLEL PRIVATE ( i, j, k )
702	!$OMP DO
703	#if __acc_fft_device
704	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
705	!$ACC PRESENT(f_inv, f_in)
706	#endif
707	DO k = nzb_y, nzt_y
708	DO i = nxl_y, nxr_y
709	DO j = 0, ny
710	f_inv(i,k,j) = f_in(j,i,k)
711	ENDDO
712	ENDDO
713	ENDDO
714	!$OMP END PARALLEL
715
716	END SUBROUTINE resort_for_yz
717
718
719	!------------------------------------------------------------------------------!
720	! Description:
721	! ------------
722	!> Transposition of input array (f_in) from y to z. For the input array, all
723	!> elements along y reside on the same PE, while after transposition, all
724	!> elements along z reside on the same PE.
725	!------------------------------------------------------------------------------!
726	SUBROUTINE transpose_yz( f_inv, f_out )
727
728
729	#if defined( __parallel )
730	USE cpulog, &
731	ONLY: cpu_log, cpu_log_nowait, log_point_s
732	#endif
733
734	USE indices, &
735	ONLY: ny, nz
736
737	USE kinds
738
739	USE pegrid
740
741	USE transpose_indices, &
742	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
743
744	IMPLICIT NONE
745
746	INTEGER(iwp) :: i !<
747	INTEGER(iwp) :: j !<
748	INTEGER(iwp) :: k !<
749	#if defined( __parallel )
750	INTEGER(iwp) :: l !<
751	INTEGER(iwp) :: zs !<
752	#endif
753
754	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
755	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
756
757	#if defined( __parallel )
758	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
759	#if __acc_fft_device
760	!$ACC DECLARE CREATE(work)
761	#endif
762	#endif
763
764
765	!
766	!-- If the PE grid is one-dimensional along y, only local reordering
767	!-- of the data is necessary and no transposition has to be done.
768	IF ( pdims(1) == 1 ) THEN
769
770	!$OMP PARALLEL PRIVATE ( i, j, k )
771	!$OMP DO
772	#if __acc_fft_device
773	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
774	!$ACC PRESENT(f_out, f_inv)
775	#endif
776	DO j = 0, ny
777	DO k = nzb_y, nzt_y
778	DO i = nxl_y, nxr_y
779	f_out(i,j,k) = f_inv(i,k,j)
780	ENDDO
781	ENDDO
782	ENDDO
783	!$OMP END PARALLEL
784
785	ELSE
786
787	#if defined( __parallel )
788	!
789	!-- Transpose array
790	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
791
792	#if __acc_fft_device
793	#ifndef __cuda_aware_mpi
794	!$ACC UPDATE HOST(f_inv)
795	#else
796	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
797	#endif
798	#endif
799
800	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
801	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
802	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
803	comm1dx, ierr )
804
805	#if __acc_fft_device
806	#ifndef __cuda_aware_mpi
807	!$ACC UPDATE DEVICE(work)
808	#else
809	!$ACC END HOST_DATA
810	#endif
811	#endif
812
813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
814
815	!
816	!-- Reorder transposed array
817	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
818	DO l = 0, pdims(1) - 1
819	zs = 1 + l * ( nzt_y - nzb_y + 1 )
820	#if __acc_fft_device
821	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
822	!$ACC PRESENT(f_out, work)
823	#endif
824	!$OMP DO
825	DO j = nys_z, nyn_z
826	DO k = zs, zs + nzt_y - nzb_y
827	DO i = nxl_z, nxr_z
828	f_out(i,j,k) = work(i,k-zs+1,j,l)
829	ENDDO
830	ENDDO
831	ENDDO
832	!$OMP END DO NOWAIT
833	ENDDO
834	!$OMP END PARALLEL
835	#endif
836
837	ENDIF
838
839	END SUBROUTINE transpose_yz
840
841
842	!------------------------------------------------------------------------------!
843	! Description:
844	! ------------
845	!> Resorting data for the transposition from z to x. The transposition itself
846	!> is carried out in transpose_zx
847	!------------------------------------------------------------------------------!
848	SUBROUTINE resort_for_zx( f_in, f_inv )
849
850
851	USE indices, &
852	ONLY: nxl, nxr, nyn, nys, nz
853
854	USE kinds
855
856	IMPLICIT NONE
857
858	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
859	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
860
861	INTEGER(iwp) :: i !<
862	INTEGER(iwp) :: j !<
863	INTEGER(iwp) :: k !<
864
865	!
866	!-- Rearrange indices of input array in order to make data to be send
867	!-- by MPI contiguous
868	!$OMP PARALLEL PRIVATE ( i, j, k )
869	!$OMP DO
870	#if __acc_fft_device
871	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
872	!$ACC PRESENT(f_in, f_inv)
873	#endif
874	DO i = nxl, nxr
875	DO j = nys, nyn
876	DO k = 1,nz
877	f_inv(j,i,k) = f_in(k,j,i)
878	ENDDO
879	ENDDO
880	ENDDO
881	!$OMP END PARALLEL
882
883	END SUBROUTINE resort_for_zx
884
885
886	!------------------------------------------------------------------------------!
887	! Description:
888	! ------------
889	!> Transposition of input array (f_in) from z to x. For the input array, all
890	!> elements along z reside on the same PE, while after transposition, all
891	!> elements along x reside on the same PE.
892	!------------------------------------------------------------------------------!
893	SUBROUTINE transpose_zx( f_inv, f_out )
894
895
896	#if defined( __parallel )
897	USE cpulog, &
898	ONLY: cpu_log, cpu_log_nowait, log_point_s
899
900	USE fft_xy, &
901	ONLY: f_vec_x, temperton_fft_vec
902	#endif
903
904	USE indices, &
905	ONLY: nx, nxl, nxr, nyn, nys, nz
906	#if defined( __parallel )
907	USE indices, &
908	ONLY: nnx
909	#endif
910
911	USE kinds
912
913	USE pegrid
914
915	USE transpose_indices, &
916	ONLY: nyn_x, nys_x, nzb_x, nzt_x
917
918	IMPLICIT NONE
919
920	INTEGER(iwp) :: i !<
921	INTEGER(iwp) :: j !<
922	INTEGER(iwp) :: k !<
923	#if defined( __parallel )
924	INTEGER(iwp) :: l !<
925	INTEGER(iwp) :: mm !<
926	INTEGER(iwp) :: xs !<
927	#endif
928
929	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
930	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
931
932	#if defined( __parallel )
933	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
934	#if __acc_fft_device
935	!$ACC DECLARE CREATE(work)
936	#endif
937	#endif
938
939
940	!
941	!-- If the PE grid is one-dimensional along y, only local reordering
942	!-- of the data is necessary and no transposition has to be done.
943	IF ( pdims(1) == 1 ) THEN
944
945	!$OMP PARALLEL PRIVATE ( i, j, k )
946	!$OMP DO
947	#if __acc_fft_device
948	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
949	!$ACC PRESENT(f_out, f_inv)
950	#endif
951	DO k = 1, nz
952	DO i = nxl, nxr
953	DO j = nys, nyn
954	f_out(i,j,k) = f_inv(j,i,k)
955	ENDDO
956	ENDDO
957	ENDDO
958	!$OMP END PARALLEL
959
960	ELSE
961
962	#if defined( __parallel )
963	!
964	!-- Transpose array
965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
966
967	#if __acc_fft_device
968	#ifndef __cuda_aware_mpi
969	!$ACC UPDATE HOST(f_inv)
970	#else
971	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
972	#endif
973	#endif
974
975	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
976	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
977	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
978	comm1dx, ierr )
979
980	#if __acc_fft_device
981	#ifndef __cuda_aware_mpi
982	!$ACC UPDATE DEVICE(work)
983	#else
984	!$ACC END HOST_DATA
985	#endif
986	#endif
987
988	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
989
990	!
991	!-- Reorder transposed array.
992	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
993	!-- saves additional data copy in fft_x.
994	IF ( temperton_fft_vec ) THEN
995
996	DO l = 0, pdims(1) - 1
997	xs = 0 + l * nnx
998	DO k = nzb_x, nzt_x
999	DO i = xs, xs + nnx - 1
1000	DO j = nys_x, nyn_x
1001	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
1002	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
1003	ENDDO
1004	ENDDO
1005	ENDDO
1006	ENDDO
1007
1008	ELSE
1009
1010	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
1011	DO l = 0, pdims(1) - 1
1012	xs = 0 + l * nnx
1013	#if __acc_fft_device
1014	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1015	!$ACC PRESENT(f_out, work)
1016	#endif
1017	!$OMP DO
1018	DO k = nzb_x, nzt_x
1019	DO i = xs, xs + nnx - 1
1020	DO j = nys_x, nyn_x
1021	f_out(i,j,k) = work(j,i-xs+1,k,l)
1022	ENDDO
1023	ENDDO
1024	ENDDO
1025	!$OMP END DO NOWAIT
1026	ENDDO
1027	!$OMP END PARALLEL
1028
1029	ENDIF
1030
1031	#endif
1032
1033	ENDIF
1034
1035	END SUBROUTINE transpose_zx
1036
1037
1038	!------------------------------------------------------------------------------!
1039	! Description:
1040	! ------------
1041	!> Resorting data after the transposition from z to y. The transposition itself
1042	!> is carried out in transpose_zy
1043	!------------------------------------------------------------------------------!
1044	SUBROUTINE resort_for_zy( f_inv, f_out )
1045
1046
1047	USE indices, &
1048	ONLY: ny
1049
1050	USE kinds
1051
1052	USE transpose_indices, &
1053	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1054
1055	IMPLICIT NONE
1056
1057	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1058	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1059
1060
1061	INTEGER(iwp) :: i !<
1062	INTEGER(iwp) :: j !<
1063	INTEGER(iwp) :: k !<
1064
1065	!
1066	!-- Rearrange indices of input array in order to make data to be send
1067	!-- by MPI contiguous
1068	!$OMP PARALLEL PRIVATE ( i, j, k )
1069	!$OMP DO
1070	#if __acc_fft_device
1071	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1072	!$ACC PRESENT(f_out, f_inv)
1073	#endif
1074	DO k = nzb_y, nzt_y
1075	DO i = nxl_y, nxr_y
1076	DO j = 0, ny
1077	f_out(j,i,k) = f_inv(i,k,j)
1078	ENDDO
1079	ENDDO
1080	ENDDO
1081	!$OMP END PARALLEL
1082
1083	END SUBROUTINE resort_for_zy
1084
1085
1086	!------------------------------------------------------------------------------!
1087	! Description:cpu_log_nowait
1088	! ------------
1089	!> Transposition of input array (f_in) from z to y. For the input array, all
1090	!> elements along z reside on the same PE, while after transposition, all
1091	!> elements along y reside on the same PE.
1092	!------------------------------------------------------------------------------!
1093	SUBROUTINE transpose_zy( f_in, f_inv )
1094
1095
1096	#if defined( __parallel )
1097	USE cpulog, &
1098	ONLY: cpu_log, cpu_log_nowait, log_point_s
1099	#endif
1100
1101	USE indices, &
1102	ONLY: ny, nz
1103
1104	USE kinds
1105
1106	USE pegrid
1107
1108	USE transpose_indices, &
1109	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1110
1111	IMPLICIT NONE
1112
1113	INTEGER(iwp) :: i !<
1114	INTEGER(iwp) :: j !<
1115	INTEGER(iwp) :: k !<
1116	#if defined( __parallel )
1117	INTEGER(iwp) :: l !<
1118	INTEGER(iwp) :: zs !<
1119	#endif
1120
1121	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1122	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1123
1124	#if defined( __parallel )
1125	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1126	#if __acc_fft_device
1127	!$ACC DECLARE CREATE(work)
1128	#endif
1129	#endif
1130
1131	!
1132	!-- If the PE grid is one-dimensional along y, the array has only to be
1133	!-- reordered locally and therefore no transposition has to be done.
1134	IF ( pdims(1) /= 1 ) THEN
1135
1136	#if defined( __parallel )
1137	!
1138	!-- Reorder input array for transposition
1139	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1140	DO l = 0, pdims(1) - 1
1141	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1142	#if __acc_fft_device
1143	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1144	!$ACC PRESENT(work, f_in)
1145	#endif
1146	!$OMP DO
1147	DO j = nys_z, nyn_z
1148	DO k = zs, zs + nzt_y - nzb_y
1149	DO i = nxl_z, nxr_z
1150	work(i,k-zs+1,j,l) = f_in(i,j,k)
1151	ENDDO
1152	ENDDO
1153	ENDDO
1154	!$OMP END DO NOWAIT
1155	ENDDO
1156	!$OMP END PARALLEL
1157
1158	!
1159	!-- Transpose array
1160	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1161
1162	#if __acc_fft_device
1163	#ifndef __cuda_aware_mpi
1164	!$ACC UPDATE HOST(work)
1165	#else
1166	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1167	#endif
1168	#endif
1169
1170	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1171	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1172	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1173	comm1dx, ierr )
1174
1175	#if __acc_fft_device
1176	#ifndef __cuda_aware_mpi
1177	!$ACC UPDATE DEVICE(f_inv)
1178	#else
1179	!$ACC END HOST_DATA
1180	#endif
1181	#endif
1182
1183	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1184	#endif
1185
1186	ELSE
1187	!
1188	!-- Reorder the array in the same way like ALLTOALL did it
1189	!$OMP PARALLEL PRIVATE ( i, j, k )
1190	!$OMP DO
1191	#if __acc_fft_device
1192	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1193	!$ACC PRESENT(f_inv, f_in)
1194	#endif
1195	DO k = nzb_y, nzt_y
1196	DO j = 0, ny
1197	DO i = nxl_y, nxr_y
1198	f_inv(i,k,j) = f_in(i,j,k)
1199	ENDDO
1200	ENDDO
1201	ENDDO
1202	!$OMP END PARALLEL
1203
1204	ENDIF
1205
1206	END SUBROUTINE transpose_zy
1207
1208
1209	!------------------------------------------------------------------------------!
1210	! Description:
1211	! ------------
1212	!> Transposition of input array (f_in) from z to y. For the input array, all
1213	!> elements along z reside on the same PE, while after transposition, all
1214	!> elements along y reside on the same PE.
1215	!> This is a direct transposition for arrays with indices in regular order
1216	!> (k,j,i) (cf. transpose_zy).
1217	!------------------------------------------------------------------------------!
1218	#if defined( __parallel )
1219	SUBROUTINE transpose_zyd( f_in, f_out )
1220
1221
1222	USE cpulog, &
1223	ONLY: cpu_log, log_point_s
1224
1225	USE indices, &
1226	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1227
1228	USE kinds
1229
1230	USE pegrid
1231
1232	USE transpose_indices, &
1233	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1234
1235	IMPLICIT NONE
1236
1237	INTEGER(iwp) :: i !<
1238	INTEGER(iwp) :: j !<
1239	INTEGER(iwp) :: k !<
1240	INTEGER(iwp) :: l !<
1241	INTEGER(iwp) :: m !<
1242	INTEGER(iwp) :: ys !<
1243
1244	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1245	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1246	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1247	REAL(wp) :: work(nnxnnynnz) !<
1248
1249	!
1250	!-- Rearrange indices of input array in order to make data to be send
1251	!-- by MPI contiguous
1252	DO i = nxl, nxr
1253	DO j = nys, nyn
1254	DO k = 1, nz
1255	f_inv(j,i,k) = f_in(k,j,i)
1256	ENDDO
1257	ENDDO
1258	ENDDO
1259
1260	!
1261	!-- Move data to different array, because memory location of work1 is
1262	!-- needed further below (work1 = work2).
1263	!-- If the PE grid is one-dimensional along x, only local reordering
1264	!-- of the data is necessary and no transposition has to be done.
1265	IF ( pdims(2) == 1 ) THEN
1266	DO k = 1, nz
1267	DO i = nxl, nxr
1268	DO j = nys, nyn
1269	f_out(j,i,k) = f_inv(j,i,k)
1270	ENDDO
1271	ENDDO
1272	ENDDO
1273	RETURN
1274	ENDIF
1275
1276	!
1277	!-- Transpose array
1278	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1279	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1280	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1281	work(1), sendrecvcount_zyd, MPI_REAL, &
1282	comm1dy, ierr )
1283	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1284
1285	!
1286	!-- Reorder transposed array
1287	m = 0
1288	DO l = 0, pdims(2) - 1
1289	ys = 0 + l * nny
1290	DO k = nzb_yd, nzt_yd
1291	DO i = nxl_yd, nxr_yd
1292	DO j = ys, ys + nny - 1
1293	m = m + 1
1294	f_out(j,i,k) = work(m)
1295	ENDDO
1296	ENDDO
1297	ENDDO
1298	ENDDO
1299
1300	END SUBROUTINE transpose_zyd
1301	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |