Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4180

Last change on this file since 4180 was 4180, checked in by scharf, 5 years ago
removed comments in 'Former revisions' section that are older than 01.01.2019
Property svn:keywords set to `Id`
File size: 32.0 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4180 2019-08-21 14:37:54Z scharf $
27	! loop reordering for performance optimization
28	!
29	! 3832 2019-03-28 13:16:58Z raasch
30	! loop reordering for performance optimization
31	!
32	! 3694 2019-01-23 17:01:49Z knoop
33	! OpenACC port for SPEC
34	!
35	!------------------------------------------------------------------------------!
36	! Description:
37	! ------------
38	!> Resorting data for the transposition from x to y. The transposition itself
39	!> is carried out in transpose_xy
40	!------------------------------------------------------------------------------!
41	SUBROUTINE resort_for_xy( f_in, f_inv )
42
43
44	USE indices, &
45	ONLY: nx
46
47	USE kinds
48
49	USE transpose_indices, &
50	ONLY: nyn_x, nys_x, nzb_x, nzt_x
51
52	IMPLICIT NONE
53
54	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
55	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
56
57
58	INTEGER(iwp) :: i !<
59	INTEGER(iwp) :: j !<
60	INTEGER(iwp) :: k !<
61	!
62	!-- Rearrange indices of input array in order to make data to be send
63	!-- by MPI contiguous
64	!$OMP PARALLEL PRIVATE ( i, j, k )
65	!$OMP DO
66	#if __acc_fft_device
67	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
68	!$ACC PRESENT(f_inv, f_in)
69	#endif
70	DO k = nzb_x, nzt_x
71	DO j = nys_x, nyn_x
72	DO i = 0, nx
73	f_inv(j,k,i) = f_in(i,j,k)
74	ENDDO
75	ENDDO
76	ENDDO
77	!$OMP END PARALLEL
78
79	END SUBROUTINE resort_for_xy
80
81
82	!------------------------------------------------------------------------------!
83	! Description:
84	! ------------
85	!> Transposition of input array (f_in) from x to y. For the input array, all
86	!> elements along x reside on the same PE, while after transposition, all
87	!> elements along y reside on the same PE.
88	!------------------------------------------------------------------------------!
89	SUBROUTINE transpose_xy( f_inv, f_out )
90
91
92	USE cpulog, &
93	ONLY: cpu_log, cpu_log_nowait, log_point_s
94
95	USE indices, &
96	ONLY: nx, ny
97
98	USE kinds
99
100	USE pegrid
101
102	USE transpose_indices, &
103	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
104
105	IMPLICIT NONE
106
107	INTEGER(iwp) :: i !<
108	INTEGER(iwp) :: j !<
109	INTEGER(iwp) :: k !<
110	INTEGER(iwp) :: l !<
111	INTEGER(iwp) :: ys !<
112
113	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
114	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
115
116	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
117	#if __acc_fft_device
118	!$ACC DECLARE CREATE(work)
119	#endif
120
121
122	IF ( numprocs /= 1 ) THEN
123
124	#if defined( __parallel )
125	!
126	!-- Transpose array
127	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
128
129	#if __acc_fft_device
130	#ifndef __cuda_aware_mpi
131	!$ACC UPDATE HOST(f_inv)
132	#else
133	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
134	#endif
135	#endif
136
137	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
138	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
139	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
140	comm1dy, ierr )
141
142	#if __acc_fft_device
143	#ifndef __cuda_aware_mpi
144	!$ACC UPDATE DEVICE(work)
145	#else
146	!$ACC END HOST_DATA
147	#endif
148	#endif
149
150	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
151
152	!
153	!-- Reorder transposed array
154	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
155	!$OMP DO
156	DO l = 0, pdims(2) - 1
157	ys = 0 + l * ( nyn_x - nys_x + 1 )
158	#if __acc_fft_device
159	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
160	!$ACC PRESENT(f_out, work)
161	#endif
162	DO i = nxl_y, nxr_y
163	DO k = nzb_y, nzt_y
164	DO j = ys, ys + nyn_x - nys_x
165	f_out(j,i,k) = work(j-ys+1,k,i,l)
166	ENDDO
167	ENDDO
168	ENDDO
169	ENDDO
170	!$OMP END PARALLEL
171	#endif
172
173	ELSE
174
175	!
176	!-- Reorder transposed array
177	!$OMP PARALLEL PRIVATE ( i, j, k )
178	!$OMP DO
179	#if __acc_fft_device
180	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
181	!$ACC PRESENT(f_out, f_inv)
182	#endif
183	DO k = nzb_y, nzt_y
184	DO i = nxl_y, nxr_y
185	DO j = 0, ny
186	f_out(j,i,k) = f_inv(j,k,i)
187	ENDDO
188	ENDDO
189	ENDDO
190	!$OMP END PARALLEL
191
192	ENDIF
193
194	END SUBROUTINE transpose_xy
195
196
197	!------------------------------------------------------------------------------!
198	! Description:
199	! ------------
200	!> Resorting data after the transposition from x to z. The transposition itself
201	!> is carried out in transpose_xz
202	!------------------------------------------------------------------------------!
203	SUBROUTINE resort_for_xz( f_inv, f_out )
204
205
206	USE indices, &
207	ONLY: nxl, nxr, nyn, nys, nz
208
209	USE kinds
210
211	IMPLICIT NONE
212
213	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
214	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
215
216	INTEGER(iwp) :: i !<
217	INTEGER(iwp) :: j !<
218	INTEGER(iwp) :: k !<
219	!
220	!-- Rearrange indices of input array in order to make data to be send
221	!-- by MPI contiguous.
222	!-- In case of parallel fft/transposition, scattered store is faster in
223	!-- backward direction!!!
224	!$OMP PARALLEL PRIVATE ( i, j, k )
225	!$OMP DO
226	#if __acc_fft_device
227	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
228	!$ACC PRESENT(f_out, f_inv)
229	#endif
230	DO i = nxl, nxr
231	DO j = nys, nyn
232	DO k = 1, nz
233	f_out(k,j,i) = f_inv(j,i,k)
234	ENDDO
235	ENDDO
236	ENDDO
237	!$OMP END PARALLEL
238
239	END SUBROUTINE resort_for_xz
240
241
242	!------------------------------------------------------------------------------!
243	! Description:
244	! ------------
245	!> Transposition of input array (f_in) from x to z. For the input array, all
246	!> elements along x reside on the same PE, while after transposition, all
247	!> elements along z reside on the same PE.
248	!------------------------------------------------------------------------------!
249	SUBROUTINE transpose_xz( f_in, f_inv )
250
251
252	USE cpulog, &
253	ONLY: cpu_log, cpu_log_nowait, log_point_s
254
255	USE indices, &
256	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
257
258	USE kinds
259
260	USE pegrid
261
262	USE transpose_indices, &
263	ONLY: nyn_x, nys_x, nzb_x, nzt_x
264
265	IMPLICIT NONE
266
267	INTEGER(iwp) :: i !<
268	INTEGER(iwp) :: j !<
269	INTEGER(iwp) :: k !<
270	INTEGER(iwp) :: l !<
271	INTEGER(iwp) :: xs !<
272
273	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
274	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
275
276	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
277	#if __acc_fft_device
278	!$ACC DECLARE CREATE(work)
279	#endif
280
281
282	!
283	!-- If the PE grid is one-dimensional along y, the array has only to be
284	!-- reordered locally and therefore no transposition has to be done.
285	IF ( pdims(1) /= 1 ) THEN
286
287	#if defined( __parallel )
288	!
289	!-- Reorder input array for transposition
290	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
291	!$OMP DO
292	DO l = 0, pdims(1) - 1
293	xs = 0 + l * nnx
294	#if __acc_fft_device
295	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
296	!$ACC PRESENT(work, f_in)
297	#endif
298	DO k = nzb_x, nzt_x
299	DO i = xs, xs + nnx - 1
300	DO j = nys_x, nyn_x
301	work(j,i-xs+1,k,l) = f_in(i,j,k)
302	ENDDO
303	ENDDO
304	ENDDO
305	ENDDO
306	!$OMP END PARALLEL
307
308	!
309	!-- Transpose array
310	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
311
312	#if __acc_fft_device
313	#ifndef __cuda_aware_mpi
314	!$ACC UPDATE HOST(work)
315	#else
316	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
317	#endif
318	#endif
319
320	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
321	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
322	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
323	comm1dx, ierr )
324
325	#if __acc_fft_device
326	#ifndef __cuda_aware_mpi
327	!$ACC UPDATE DEVICE(f_inv)
328	#else
329	!$ACC END HOST_DATA
330	#endif
331	#endif
332
333	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
334	#endif
335
336	ELSE
337
338	!
339	!-- Reorder the array in a way that the z index is in first position
340	!$OMP PARALLEL PRIVATE ( i, j, k )
341	!$OMP DO
342	#if __acc_fft_device
343	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
344	!$ACC PRESENT(f_inv, f_in)
345	#endif
346	DO i = nxl, nxr
347	DO j = nys, nyn
348	DO k = 1, nz
349	f_inv(j,i,k) = f_in(i,j,k)
350	ENDDO
351	ENDDO
352	ENDDO
353	!$OMP END PARALLEL
354
355	ENDIF
356
357	END SUBROUTINE transpose_xz
358
359
360	!------------------------------------------------------------------------------!
361	! Description:
362	! ------------
363	!> Resorting data after the transposition from y to x. The transposition itself
364	!> is carried out in transpose_yx
365	!------------------------------------------------------------------------------!
366	SUBROUTINE resort_for_yx( f_inv, f_out )
367
368
369	USE indices, &
370	ONLY: nx
371
372	USE kinds
373
374	USE transpose_indices, &
375	ONLY: nyn_x, nys_x, nzb_x, nzt_x
376
377	IMPLICIT NONE
378
379	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
380	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
381
382
383	INTEGER(iwp) :: i !<
384	INTEGER(iwp) :: j !<
385	INTEGER(iwp) :: k !<
386	!
387	!-- Rearrange indices of input array in order to make data to be send
388	!-- by MPI contiguous
389	!$OMP PARALLEL PRIVATE ( i, j, k )
390	!$OMP DO
391	#if __acc_fft_device
392	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
393	!$ACC PRESENT(f_out, f_inv)
394	#endif
395	DO k = nzb_x, nzt_x
396	DO j = nys_x, nyn_x
397	DO i = 0, nx
398	f_out(i,j,k) = f_inv(j,k,i)
399	ENDDO
400	ENDDO
401	ENDDO
402	!$OMP END PARALLEL
403
404	END SUBROUTINE resort_for_yx
405
406
407	!------------------------------------------------------------------------------!
408	! Description:
409	! ------------
410	!> Transposition of input array (f_in) from y to x. For the input array, all
411	!> elements along y reside on the same PE, while after transposition, all
412	!> elements along x reside on the same PE.
413	!------------------------------------------------------------------------------!
414	SUBROUTINE transpose_yx( f_in, f_inv )
415
416
417	USE cpulog, &
418	ONLY: cpu_log, cpu_log_nowait, log_point_s
419
420	USE indices, &
421	ONLY: nx, ny
422
423	USE kinds
424
425	USE pegrid
426
427	USE transpose_indices, &
428	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
429
430	IMPLICIT NONE
431
432	INTEGER(iwp) :: i !<
433	INTEGER(iwp) :: j !<
434	INTEGER(iwp) :: k !<
435	INTEGER(iwp) :: l !<
436	INTEGER(iwp) :: ys !<
437
438	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
439	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
440
441	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
442	#if __acc_fft_device
443	!$ACC DECLARE CREATE(work)
444	#endif
445
446
447	IF ( numprocs /= 1 ) THEN
448
449	#if defined( __parallel )
450	!
451	!-- Reorder input array for transposition
452	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
453	!$OMP DO
454	DO l = 0, pdims(2) - 1
455	ys = 0 + l * ( nyn_x - nys_x + 1 )
456	#if __acc_fft_device
457	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
458	!$ACC PRESENT(work, f_in)
459	#endif
460	DO i = nxl_y, nxr_y
461	DO k = nzb_y, nzt_y
462	DO j = ys, ys + nyn_x - nys_x
463	work(j-ys+1,k,i,l) = f_in(j,i,k)
464	ENDDO
465	ENDDO
466	ENDDO
467	ENDDO
468	!$OMP END PARALLEL
469
470	!
471	!-- Transpose array
472	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
473
474	#if __acc_fft_device
475	#ifndef __cuda_aware_mpi
476	!$ACC UPDATE HOST(work)
477	#else
478	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
479	#endif
480	#endif
481
482	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
483	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
484	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
485	comm1dy, ierr )
486
487	#if __acc_fft_device
488	#ifndef __cuda_aware_mpi
489	!$ACC UPDATE DEVICE(f_inv)
490	#else
491	!$ACC END HOST_DATA
492	#endif
493	#endif
494
495	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
496	#endif
497
498	ELSE
499
500	!
501	!-- Reorder array f_in the same way as ALLTOALL did it
502	!$OMP PARALLEL PRIVATE ( i, j, k )
503	!$OMP DO
504	#if __acc_fft_device
505	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
506	!$ACC PRESENT(f_inv, f_in)
507	#endif
508	DO i = nxl_y, nxr_y
509	DO k = nzb_y, nzt_y
510	DO j = 0, ny
511	f_inv(j,k,i) = f_in(j,i,k)
512	ENDDO
513	ENDDO
514	ENDDO
515	!$OMP END PARALLEL
516
517	ENDIF
518
519	END SUBROUTINE transpose_yx
520
521
522	!------------------------------------------------------------------------------!
523	! Description:
524	! ------------
525	!> Transposition of input array (f_in) from y to x. For the input array, all
526	!> elements along y reside on the same PE, while after transposition, all
527	!> elements along x reside on the same PE.
528	!> This is a direct transposition for arrays with indices in regular order
529	!> (k,j,i) (cf. transpose_yx).
530	!------------------------------------------------------------------------------!
531	SUBROUTINE transpose_yxd( f_in, f_out )
532
533
534	USE cpulog, &
535	ONLY: cpu_log, log_point_s
536
537	USE indices, &
538	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
539
540	USE kinds
541
542	USE pegrid
543
544	USE transpose_indices, &
545	ONLY: nyn_x, nys_x, nzb_x, nzt_x
546
547	IMPLICIT NONE
548
549	INTEGER(iwp) :: i !<
550	INTEGER(iwp) :: j !<
551	INTEGER(iwp) :: k !<
552	INTEGER(iwp) :: l !<
553	INTEGER(iwp) :: m !<
554	INTEGER(iwp) :: xs !<
555
556	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
557	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
558	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
559	REAL(wp) :: work(nnxnnynnz) !<
560	#if defined( __parallel )
561
562	!
563	!-- Rearrange indices of input array in order to make data to be send
564	!-- by MPI contiguous
565	DO k = 1, nz
566	DO j = nys, nyn
567	DO i = nxl, nxr
568	f_inv(i,k,j) = f_in(k,j,i)
569	ENDDO
570	ENDDO
571	ENDDO
572
573	!
574	!-- Transpose array
575	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
576	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
577	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
578	work(1), sendrecvcount_xy, MPI_REAL, &
579	comm1dx, ierr )
580	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
581
582	!
583	!-- Reorder transposed array
584	m = 0
585	DO l = 0, pdims(1) - 1
586	xs = 0 + l * nnx
587	DO j = nys_x, nyn_x
588	DO k = 1, nz
589	DO i = xs, xs + nnx - 1
590	m = m + 1
591	f_out(i,j,k) = work(m)
592	ENDDO
593	ENDDO
594	ENDDO
595	ENDDO
596
597	#endif
598
599	END SUBROUTINE transpose_yxd
600
601
602	!------------------------------------------------------------------------------!
603	! Description:
604	! ------------
605	!> Resorting data for the transposition from y to z. The transposition itself
606	!> is carried out in transpose_yz
607	!------------------------------------------------------------------------------!
608	SUBROUTINE resort_for_yz( f_in, f_inv )
609
610
611	USE indices, &
612	ONLY: ny
613
614	USE kinds
615
616	USE transpose_indices, &
617	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
618
619	IMPLICIT NONE
620
621	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
622	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
623
624	INTEGER(iwp) :: i !<
625	INTEGER(iwp) :: j !<
626	INTEGER(iwp) :: k !<
627
628	!
629	!-- Rearrange indices of input array in order to make data to be send
630	!-- by MPI contiguous
631	!$OMP PARALLEL PRIVATE ( i, j, k )
632	!$OMP DO
633	#if __acc_fft_device
634	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
635	!$ACC PRESENT(f_inv, f_in)
636	#endif
637	DO k = nzb_y, nzt_y
638	DO i = nxl_y, nxr_y
639	DO j = 0, ny
640	f_inv(i,k,j) = f_in(j,i,k)
641	ENDDO
642	ENDDO
643	ENDDO
644	!$OMP END PARALLEL
645
646	END SUBROUTINE resort_for_yz
647
648
649	!------------------------------------------------------------------------------!
650	! Description:
651	! ------------
652	!> Transposition of input array (f_in) from y to z. For the input array, all
653	!> elements along y reside on the same PE, while after transposition, all
654	!> elements along z reside on the same PE.
655	!------------------------------------------------------------------------------!
656	SUBROUTINE transpose_yz( f_inv, f_out )
657
658
659	USE cpulog, &
660	ONLY: cpu_log, cpu_log_nowait, log_point_s
661
662	USE indices, &
663	ONLY: ny, nz
664
665	USE kinds
666
667	USE pegrid
668
669	USE transpose_indices, &
670	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
671
672	IMPLICIT NONE
673
674	INTEGER(iwp) :: i !<
675	INTEGER(iwp) :: j !<
676	INTEGER(iwp) :: k !<
677	INTEGER(iwp) :: l !<
678	INTEGER(iwp) :: zs !<
679
680	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
681	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
682
683	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
684	#if __acc_fft_device
685	!$ACC DECLARE CREATE(work)
686	#endif
687
688
689	!
690	!-- If the PE grid is one-dimensional along y, only local reordering
691	!-- of the data is necessary and no transposition has to be done.
692	IF ( pdims(1) == 1 ) THEN
693
694	!$OMP PARALLEL PRIVATE ( i, j, k )
695	!$OMP DO
696	#if __acc_fft_device
697	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
698	!$ACC PRESENT(f_out, f_inv)
699	#endif
700	DO j = 0, ny
701	DO k = nzb_y, nzt_y
702	DO i = nxl_y, nxr_y
703	f_out(i,j,k) = f_inv(i,k,j)
704	ENDDO
705	ENDDO
706	ENDDO
707	!$OMP END PARALLEL
708
709	ELSE
710
711	#if defined( __parallel )
712	!
713	!-- Transpose array
714	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
715
716	#if __acc_fft_device
717	#ifndef __cuda_aware_mpi
718	!$ACC UPDATE HOST(f_inv)
719	#else
720	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
721	#endif
722	#endif
723
724	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
725	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
726	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
727	comm1dx, ierr )
728
729	#if __acc_fft_device
730	#ifndef __cuda_aware_mpi
731	!$ACC UPDATE DEVICE(work)
732	#else
733	!$ACC END HOST_DATA
734	#endif
735	#endif
736
737	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
738
739	!
740	!-- Reorder transposed array
741	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
742	!$OMP DO
743	DO l = 0, pdims(1) - 1
744	zs = 1 + l * ( nzt_y - nzb_y + 1 )
745	#if __acc_fft_device
746	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
747	!$ACC PRESENT(f_out, work)
748	#endif
749	DO j = nys_z, nyn_z
750	DO k = zs, zs + nzt_y - nzb_y
751	DO i = nxl_z, nxr_z
752	f_out(i,j,k) = work(i,k-zs+1,j,l)
753	ENDDO
754	ENDDO
755	ENDDO
756	ENDDO
757	!$OMP END PARALLEL
758	#endif
759
760	ENDIF
761
762	END SUBROUTINE transpose_yz
763
764
765	!------------------------------------------------------------------------------!
766	! Description:
767	! ------------
768	!> Resorting data for the transposition from z to x. The transposition itself
769	!> is carried out in transpose_zx
770	!------------------------------------------------------------------------------!
771	SUBROUTINE resort_for_zx( f_in, f_inv )
772
773
774	USE indices, &
775	ONLY: nxl, nxr, nyn, nys, nz
776
777	USE kinds
778
779	IMPLICIT NONE
780
781	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
782	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
783
784	INTEGER(iwp) :: i !<
785	INTEGER(iwp) :: j !<
786	INTEGER(iwp) :: k !<
787
788	!
789	!-- Rearrange indices of input array in order to make data to be send
790	!-- by MPI contiguous
791	!$OMP PARALLEL PRIVATE ( i, j, k )
792	!$OMP DO
793	#if __acc_fft_device
794	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
795	!$ACC PRESENT(f_in, f_inv)
796	#endif
797	DO i = nxl, nxr
798	DO j = nys, nyn
799	DO k = 1,nz
800	f_inv(j,i,k) = f_in(k,j,i)
801	ENDDO
802	ENDDO
803	ENDDO
804	!$OMP END PARALLEL
805
806	END SUBROUTINE resort_for_zx
807
808
809	!------------------------------------------------------------------------------!
810	! Description:
811	! ------------
812	!> Transposition of input array (f_in) from z to x. For the input array, all
813	!> elements along z reside on the same PE, while after transposition, all
814	!> elements along x reside on the same PE.
815	!------------------------------------------------------------------------------!
816	SUBROUTINE transpose_zx( f_inv, f_out )
817
818
819	USE cpulog, &
820	ONLY: cpu_log, cpu_log_nowait, log_point_s
821
822	USE indices, &
823	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
824
825	USE kinds
826
827	USE pegrid
828
829	USE transpose_indices, &
830	ONLY: nyn_x, nys_x, nzb_x, nzt_x
831
832	IMPLICIT NONE
833
834	INTEGER(iwp) :: i !<
835	INTEGER(iwp) :: j !<
836	INTEGER(iwp) :: k !<
837	INTEGER(iwp) :: l !<
838	INTEGER(iwp) :: xs !<
839
840	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
841	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
842
843	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
844	#if __acc_fft_device
845	!$ACC DECLARE CREATE(work)
846	#endif
847
848
849	!
850	!-- If the PE grid is one-dimensional along y, only local reordering
851	!-- of the data is necessary and no transposition has to be done.
852	IF ( pdims(1) == 1 ) THEN
853
854	!$OMP PARALLEL PRIVATE ( i, j, k )
855	!$OMP DO
856	#if __acc_fft_device
857	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
858	!$ACC PRESENT(f_out, f_inv)
859	#endif
860	DO k = 1, nz
861	DO i = nxl, nxr
862	DO j = nys, nyn
863	f_out(i,j,k) = f_inv(j,i,k)
864	ENDDO
865	ENDDO
866	ENDDO
867	!$OMP END PARALLEL
868
869	ELSE
870
871	#if defined( __parallel )
872	!
873	!-- Transpose array
874	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
875
876	#if __acc_fft_device
877	#ifndef __cuda_aware_mpi
878	!$ACC UPDATE HOST(f_inv)
879	#else
880	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
881	#endif
882	#endif
883
884	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
885	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
886	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
887	comm1dx, ierr )
888
889	#if __acc_fft_device
890	#ifndef __cuda_aware_mpi
891	!$ACC UPDATE DEVICE(work)
892	#else
893	!$ACC END HOST_DATA
894	#endif
895	#endif
896
897	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
898
899	!
900	!-- Reorder transposed array
901	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
902	!$OMP DO
903	DO l = 0, pdims(1) - 1
904	xs = 0 + l * nnx
905	#if __acc_fft_device
906	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
907	!$ACC PRESENT(f_out, work)
908	#endif
909	DO k = nzb_x, nzt_x
910	DO i = xs, xs + nnx - 1
911	DO j = nys_x, nyn_x
912	f_out(i,j,k) = work(j,i-xs+1,k,l)
913	ENDDO
914	ENDDO
915	ENDDO
916	ENDDO
917	!$OMP END PARALLEL
918	#endif
919
920	ENDIF
921
922	END SUBROUTINE transpose_zx
923
924
925	!------------------------------------------------------------------------------!
926	! Description:
927	! ------------
928	!> Resorting data after the transposition from z to y. The transposition itself
929	!> is carried out in transpose_zy
930	!------------------------------------------------------------------------------!
931	SUBROUTINE resort_for_zy( f_inv, f_out )
932
933
934	USE indices, &
935	ONLY: ny
936
937	USE kinds
938
939	USE transpose_indices, &
940	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
941
942	IMPLICIT NONE
943
944	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
945	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
946
947
948	INTEGER(iwp) :: i !<
949	INTEGER(iwp) :: j !<
950	INTEGER(iwp) :: k !<
951
952	!
953	!-- Rearrange indices of input array in order to make data to be send
954	!-- by MPI contiguous
955	!$OMP PARALLEL PRIVATE ( i, j, k )
956	!$OMP DO
957	#if __acc_fft_device
958	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
959	!$ACC PRESENT(f_out, f_inv)
960	#endif
961	DO k = nzb_y, nzt_y
962	DO i = nxl_y, nxr_y
963	DO j = 0, ny
964	f_out(j,i,k) = f_inv(i,k,j)
965	ENDDO
966	ENDDO
967	ENDDO
968	!$OMP END PARALLEL
969
970	END SUBROUTINE resort_for_zy
971
972
973	!------------------------------------------------------------------------------!
974	! Description:cpu_log_nowait
975	! ------------
976	!> Transposition of input array (f_in) from z to y. For the input array, all
977	!> elements along z reside on the same PE, while after transposition, all
978	!> elements along y reside on the same PE.
979	!------------------------------------------------------------------------------!
980	SUBROUTINE transpose_zy( f_in, f_inv )
981
982
983	USE cpulog, &
984	ONLY: cpu_log, cpu_log_nowait, log_point_s
985
986	USE indices, &
987	ONLY: ny, nz
988
989	USE kinds
990
991	USE pegrid
992
993	USE transpose_indices, &
994	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
995
996	IMPLICIT NONE
997
998	INTEGER(iwp) :: i !<
999	INTEGER(iwp) :: j !<
1000	INTEGER(iwp) :: k !<
1001	INTEGER(iwp) :: l !<
1002	INTEGER(iwp) :: zs !<
1003
1004	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1005	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1006
1007	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1008	#if __acc_fft_device
1009	!$ACC DECLARE CREATE(work)
1010	#endif
1011
1012	!
1013	!-- If the PE grid is one-dimensional along y, the array has only to be
1014	!-- reordered locally and therefore no transposition has to be done.
1015	IF ( pdims(1) /= 1 ) THEN
1016
1017	#if defined( __parallel )
1018	!
1019	!-- Reorder input array for transposition
1020	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1021	!$OMP DO
1022	DO l = 0, pdims(1) - 1
1023	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1024	#if __acc_fft_device
1025	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1026	!$ACC PRESENT(work, f_in)
1027	#endif
1028	DO j = nys_z, nyn_z
1029	DO k = zs, zs + nzt_y - nzb_y
1030	DO i = nxl_z, nxr_z
1031	work(i,k-zs+1,j,l) = f_in(i,j,k)
1032	ENDDO
1033	ENDDO
1034	ENDDO
1035	ENDDO
1036	!$OMP END PARALLEL
1037
1038	!
1039	!-- Transpose array
1040	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1041
1042	#if __acc_fft_device
1043	#ifndef __cuda_aware_mpi
1044	!$ACC UPDATE HOST(work)
1045	#else
1046	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1047	#endif
1048	#endif
1049
1050	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1051	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1052	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1053	comm1dx, ierr )
1054
1055	#if __acc_fft_device
1056	#ifndef __cuda_aware_mpi
1057	!$ACC UPDATE DEVICE(f_inv)
1058	#else
1059	!$ACC END HOST_DATA
1060	#endif
1061	#endif
1062
1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1064	#endif
1065
1066	ELSE
1067	!
1068	!-- Reorder the array in the same way like ALLTOALL did it
1069	!$OMP PARALLEL PRIVATE ( i, j, k )
1070	!$OMP DO
1071	#if __acc_fft_device
1072	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1073	!$ACC PRESENT(f_inv, f_in)
1074	#endif
1075	DO k = nzb_y, nzt_y
1076	DO j = 0, ny
1077	DO i = nxl_y, nxr_y
1078	f_inv(i,k,j) = f_in(i,j,k)
1079	ENDDO
1080	ENDDO
1081	ENDDO
1082	!$OMP END PARALLEL
1083
1084	ENDIF
1085
1086	END SUBROUTINE transpose_zy
1087
1088
1089	!------------------------------------------------------------------------------!
1090	! Description:
1091	! ------------
1092	!> Transposition of input array (f_in) from z to y. For the input array, all
1093	!> elements along z reside on the same PE, while after transposition, all
1094	!> elements along y reside on the same PE.
1095	!> This is a direct transposition for arrays with indices in regular order
1096	!> (k,j,i) (cf. transpose_zy).
1097	!------------------------------------------------------------------------------!
1098	SUBROUTINE transpose_zyd( f_in, f_out )
1099
1100
1101	USE cpulog, &
1102	ONLY: cpu_log, log_point_s
1103
1104	USE indices, &
1105	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1106
1107	USE kinds
1108
1109	USE pegrid
1110
1111	USE transpose_indices, &
1112	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1113
1114	IMPLICIT NONE
1115
1116	INTEGER(iwp) :: i !<
1117	INTEGER(iwp) :: j !<
1118	INTEGER(iwp) :: k !<
1119	INTEGER(iwp) :: l !<
1120	INTEGER(iwp) :: m !<
1121	INTEGER(iwp) :: ys !<
1122
1123	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1124	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1125	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1126	REAL(wp) :: work(nnxnnynnz) !<
1127
1128	#if defined( __parallel )
1129
1130	!
1131	!-- Rearrange indices of input array in order to make data to be send
1132	!-- by MPI contiguous
1133	DO i = nxl, nxr
1134	DO j = nys, nyn
1135	DO k = 1, nz
1136	f_inv(j,i,k) = f_in(k,j,i)
1137	ENDDO
1138	ENDDO
1139	ENDDO
1140
1141	!
1142	!-- Move data to different array, because memory location of work1 is
1143	!-- needed further below (work1 = work2).
1144	!-- If the PE grid is one-dimensional along x, only local reordering
1145	!-- of the data is necessary and no transposition has to be done.
1146	IF ( pdims(2) == 1 ) THEN
1147	DO k = 1, nz
1148	DO i = nxl, nxr
1149	DO j = nys, nyn
1150	f_out(j,i,k) = f_inv(j,i,k)
1151	ENDDO
1152	ENDDO
1153	ENDDO
1154	RETURN
1155	ENDIF
1156
1157	!
1158	!-- Transpose array
1159	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1160	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1161	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1162	work(1), sendrecvcount_zyd, MPI_REAL, &
1163	comm1dy, ierr )
1164	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1165
1166	!
1167	!-- Reorder transposed array
1168	m = 0
1169	DO l = 0, pdims(2) - 1
1170	ys = 0 + l * nny
1171	DO k = nzb_yd, nzt_yd
1172	DO i = nxl_yd, nxr_yd
1173	DO j = ys, ys + nny - 1
1174	m = m + 1
1175	f_out(j,i,k) = work(m)
1176	ENDDO
1177	ENDDO
1178	ENDDO
1179	ENDDO
1180
1181	#endif
1182
1183	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |