Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4182

Last change on this file since 4182 was 4182, checked in by scharf, 5 years ago
corrected "Former revisions" section minor formatting in "Former revisions" section added "Author" section
Property svn:keywords set to `Id`
File size: 32.1 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4182 2019-08-22 15:20:23Z scharf $
27	! Corrected "Former revisions" section
28	!
29	! 4171 2019-08-19 17:44:09Z gronemeier
30	! loop reordering for performance optimization
31	!
32	! 3832 2019-03-28 13:16:58Z raasch
33	! loop reordering for performance optimization
34	!
35	! 3694 2019-01-23 17:01:49Z knoop
36	! OpenACC port for SPEC
37	!
38	! Revision 1.1 1997/07/24 11:25:18 raasch
39	! Initial revision
40	!
41	!
42	! Description:
43	! ------------
44	!> Resorting data for the transposition from x to y. The transposition itself
45	!> is carried out in transpose_xy
46	!------------------------------------------------------------------------------!
47
48	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
49
50	SUBROUTINE resort_for_xy( f_in, f_inv )
51
52
53	USE indices, &
54	ONLY: nx
55
56	USE kinds
57
58	USE transpose_indices, &
59	ONLY: nyn_x, nys_x, nzb_x, nzt_x
60
61	IMPLICIT NONE
62
63	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
64	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
65
66
67	INTEGER(iwp) :: i !<
68	INTEGER(iwp) :: j !<
69	INTEGER(iwp) :: k !<
70	!
71	!-- Rearrange indices of input array in order to make data to be send
72	!-- by MPI contiguous
73	!$OMP PARALLEL PRIVATE ( i, j, k )
74	!$OMP DO
75	#if __acc_fft_device
76	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
77	!$ACC PRESENT(f_inv, f_in)
78	#endif
79	DO k = nzb_x, nzt_x
80	DO j = nys_x, nyn_x
81	DO i = 0, nx
82	f_inv(j,k,i) = f_in(i,j,k)
83	ENDDO
84	ENDDO
85	ENDDO
86	!$OMP END PARALLEL
87
88	END SUBROUTINE resort_for_xy
89
90
91	!------------------------------------------------------------------------------!
92	! Description:
93	! ------------
94	!> Transposition of input array (f_in) from x to y. For the input array, all
95	!> elements along x reside on the same PE, while after transposition, all
96	!> elements along y reside on the same PE.
97	!------------------------------------------------------------------------------!
98	SUBROUTINE transpose_xy( f_inv, f_out )
99
100
101	USE cpulog, &
102	ONLY: cpu_log, cpu_log_nowait, log_point_s
103
104	USE indices, &
105	ONLY: nx, ny
106
107	USE kinds
108
109	USE pegrid
110
111	USE transpose_indices, &
112	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
113
114	IMPLICIT NONE
115
116	INTEGER(iwp) :: i !<
117	INTEGER(iwp) :: j !<
118	INTEGER(iwp) :: k !<
119	INTEGER(iwp) :: l !<
120	INTEGER(iwp) :: ys !<
121
122	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
123	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
124
125	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
126	#if __acc_fft_device
127	!$ACC DECLARE CREATE(work)
128	#endif
129
130
131	IF ( numprocs /= 1 ) THEN
132
133	#if defined( __parallel )
134	!
135	!-- Transpose array
136	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
137
138	#if __acc_fft_device
139	#ifndef __cuda_aware_mpi
140	!$ACC UPDATE HOST(f_inv)
141	#else
142	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
143	#endif
144	#endif
145
146	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
147	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
148	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
149	comm1dy, ierr )
150
151	#if __acc_fft_device
152	#ifndef __cuda_aware_mpi
153	!$ACC UPDATE DEVICE(work)
154	#else
155	!$ACC END HOST_DATA
156	#endif
157	#endif
158
159	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
160
161	!
162	!-- Reorder transposed array
163	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
164	!$OMP DO
165	DO l = 0, pdims(2) - 1
166	ys = 0 + l * ( nyn_x - nys_x + 1 )
167	#if __acc_fft_device
168	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
169	!$ACC PRESENT(f_out, work)
170	#endif
171	DO i = nxl_y, nxr_y
172	DO k = nzb_y, nzt_y
173	DO j = ys, ys + nyn_x - nys_x
174	f_out(j,i,k) = work(j-ys+1,k,i,l)
175	ENDDO
176	ENDDO
177	ENDDO
178	ENDDO
179	!$OMP END PARALLEL
180	#endif
181
182	ELSE
183
184	!
185	!-- Reorder transposed array
186	!$OMP PARALLEL PRIVATE ( i, j, k )
187	!$OMP DO
188	#if __acc_fft_device
189	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
190	!$ACC PRESENT(f_out, f_inv)
191	#endif
192	DO k = nzb_y, nzt_y
193	DO i = nxl_y, nxr_y
194	DO j = 0, ny
195	f_out(j,i,k) = f_inv(j,k,i)
196	ENDDO
197	ENDDO
198	ENDDO
199	!$OMP END PARALLEL
200
201	ENDIF
202
203	END SUBROUTINE transpose_xy
204
205
206	!------------------------------------------------------------------------------!
207	! Description:
208	! ------------
209	!> Resorting data after the transposition from x to z. The transposition itself
210	!> is carried out in transpose_xz
211	!------------------------------------------------------------------------------!
212	SUBROUTINE resort_for_xz( f_inv, f_out )
213
214
215	USE indices, &
216	ONLY: nxl, nxr, nyn, nys, nz
217
218	USE kinds
219
220	IMPLICIT NONE
221
222	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
223	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
224
225	INTEGER(iwp) :: i !<
226	INTEGER(iwp) :: j !<
227	INTEGER(iwp) :: k !<
228	!
229	!-- Rearrange indices of input array in order to make data to be send
230	!-- by MPI contiguous.
231	!-- In case of parallel fft/transposition, scattered store is faster in
232	!-- backward direction!!!
233	!$OMP PARALLEL PRIVATE ( i, j, k )
234	!$OMP DO
235	#if __acc_fft_device
236	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
237	!$ACC PRESENT(f_out, f_inv)
238	#endif
239	DO i = nxl, nxr
240	DO j = nys, nyn
241	DO k = 1, nz
242	f_out(k,j,i) = f_inv(j,i,k)
243	ENDDO
244	ENDDO
245	ENDDO
246	!$OMP END PARALLEL
247
248	END SUBROUTINE resort_for_xz
249
250
251	!------------------------------------------------------------------------------!
252	! Description:
253	! ------------
254	!> Transposition of input array (f_in) from x to z. For the input array, all
255	!> elements along x reside on the same PE, while after transposition, all
256	!> elements along z reside on the same PE.
257	!------------------------------------------------------------------------------!
258	SUBROUTINE transpose_xz( f_in, f_inv )
259
260
261	USE cpulog, &
262	ONLY: cpu_log, cpu_log_nowait, log_point_s
263
264	USE indices, &
265	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
266
267	USE kinds
268
269	USE pegrid
270
271	USE transpose_indices, &
272	ONLY: nyn_x, nys_x, nzb_x, nzt_x
273
274	IMPLICIT NONE
275
276	INTEGER(iwp) :: i !<
277	INTEGER(iwp) :: j !<
278	INTEGER(iwp) :: k !<
279	INTEGER(iwp) :: l !<
280	INTEGER(iwp) :: xs !<
281
282	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
283	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
284
285	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
286	#if __acc_fft_device
287	!$ACC DECLARE CREATE(work)
288	#endif
289
290
291	!
292	!-- If the PE grid is one-dimensional along y, the array has only to be
293	!-- reordered locally and therefore no transposition has to be done.
294	IF ( pdims(1) /= 1 ) THEN
295
296	#if defined( __parallel )
297	!
298	!-- Reorder input array for transposition
299	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
300	!$OMP DO
301	DO l = 0, pdims(1) - 1
302	xs = 0 + l * nnx
303	#if __acc_fft_device
304	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
305	!$ACC PRESENT(work, f_in)
306	#endif
307	DO k = nzb_x, nzt_x
308	DO i = xs, xs + nnx - 1
309	DO j = nys_x, nyn_x
310	work(j,i-xs+1,k,l) = f_in(i,j,k)
311	ENDDO
312	ENDDO
313	ENDDO
314	ENDDO
315	!$OMP END PARALLEL
316
317	!
318	!-- Transpose array
319	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
320
321	#if __acc_fft_device
322	#ifndef __cuda_aware_mpi
323	!$ACC UPDATE HOST(work)
324	#else
325	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
326	#endif
327	#endif
328
329	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
330	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
331	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
332	comm1dx, ierr )
333
334	#if __acc_fft_device
335	#ifndef __cuda_aware_mpi
336	!$ACC UPDATE DEVICE(f_inv)
337	#else
338	!$ACC END HOST_DATA
339	#endif
340	#endif
341
342	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
343	#endif
344
345	ELSE
346
347	!
348	!-- Reorder the array in a way that the z index is in first position
349	!$OMP PARALLEL PRIVATE ( i, j, k )
350	!$OMP DO
351	#if __acc_fft_device
352	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
353	!$ACC PRESENT(f_inv, f_in)
354	#endif
355	DO i = nxl, nxr
356	DO j = nys, nyn
357	DO k = 1, nz
358	f_inv(j,i,k) = f_in(i,j,k)
359	ENDDO
360	ENDDO
361	ENDDO
362	!$OMP END PARALLEL
363
364	ENDIF
365
366	END SUBROUTINE transpose_xz
367
368
369	!------------------------------------------------------------------------------!
370	! Description:
371	! ------------
372	!> Resorting data after the transposition from y to x. The transposition itself
373	!> is carried out in transpose_yx
374	!------------------------------------------------------------------------------!
375	SUBROUTINE resort_for_yx( f_inv, f_out )
376
377
378	USE indices, &
379	ONLY: nx
380
381	USE kinds
382
383	USE transpose_indices, &
384	ONLY: nyn_x, nys_x, nzb_x, nzt_x
385
386	IMPLICIT NONE
387
388	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
389	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
390
391
392	INTEGER(iwp) :: i !<
393	INTEGER(iwp) :: j !<
394	INTEGER(iwp) :: k !<
395	!
396	!-- Rearrange indices of input array in order to make data to be send
397	!-- by MPI contiguous
398	!$OMP PARALLEL PRIVATE ( i, j, k )
399	!$OMP DO
400	#if __acc_fft_device
401	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
402	!$ACC PRESENT(f_out, f_inv)
403	#endif
404	DO k = nzb_x, nzt_x
405	DO j = nys_x, nyn_x
406	DO i = 0, nx
407	f_out(i,j,k) = f_inv(j,k,i)
408	ENDDO
409	ENDDO
410	ENDDO
411	!$OMP END PARALLEL
412
413	END SUBROUTINE resort_for_yx
414
415
416	!------------------------------------------------------------------------------!
417	! Description:
418	! ------------
419	!> Transposition of input array (f_in) from y to x. For the input array, all
420	!> elements along y reside on the same PE, while after transposition, all
421	!> elements along x reside on the same PE.
422	!------------------------------------------------------------------------------!
423	SUBROUTINE transpose_yx( f_in, f_inv )
424
425
426	USE cpulog, &
427	ONLY: cpu_log, cpu_log_nowait, log_point_s
428
429	USE indices, &
430	ONLY: nx, ny
431
432	USE kinds
433
434	USE pegrid
435
436	USE transpose_indices, &
437	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
438
439	IMPLICIT NONE
440
441	INTEGER(iwp) :: i !<
442	INTEGER(iwp) :: j !<
443	INTEGER(iwp) :: k !<
444	INTEGER(iwp) :: l !<
445	INTEGER(iwp) :: ys !<
446
447	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
448	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
449
450	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
451	#if __acc_fft_device
452	!$ACC DECLARE CREATE(work)
453	#endif
454
455
456	IF ( numprocs /= 1 ) THEN
457
458	#if defined( __parallel )
459	!
460	!-- Reorder input array for transposition
461	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
462	!$OMP DO
463	DO l = 0, pdims(2) - 1
464	ys = 0 + l * ( nyn_x - nys_x + 1 )
465	#if __acc_fft_device
466	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
467	!$ACC PRESENT(work, f_in)
468	#endif
469	DO i = nxl_y, nxr_y
470	DO k = nzb_y, nzt_y
471	DO j = ys, ys + nyn_x - nys_x
472	work(j-ys+1,k,i,l) = f_in(j,i,k)
473	ENDDO
474	ENDDO
475	ENDDO
476	ENDDO
477	!$OMP END PARALLEL
478
479	!
480	!-- Transpose array
481	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
482
483	#if __acc_fft_device
484	#ifndef __cuda_aware_mpi
485	!$ACC UPDATE HOST(work)
486	#else
487	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
488	#endif
489	#endif
490
491	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
492	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
493	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
494	comm1dy, ierr )
495
496	#if __acc_fft_device
497	#ifndef __cuda_aware_mpi
498	!$ACC UPDATE DEVICE(f_inv)
499	#else
500	!$ACC END HOST_DATA
501	#endif
502	#endif
503
504	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
505	#endif
506
507	ELSE
508
509	!
510	!-- Reorder array f_in the same way as ALLTOALL did it
511	!$OMP PARALLEL PRIVATE ( i, j, k )
512	!$OMP DO
513	#if __acc_fft_device
514	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
515	!$ACC PRESENT(f_inv, f_in)
516	#endif
517	DO i = nxl_y, nxr_y
518	DO k = nzb_y, nzt_y
519	DO j = 0, ny
520	f_inv(j,k,i) = f_in(j,i,k)
521	ENDDO
522	ENDDO
523	ENDDO
524	!$OMP END PARALLEL
525
526	ENDIF
527
528	END SUBROUTINE transpose_yx
529
530
531	!------------------------------------------------------------------------------!
532	! Description:
533	! ------------
534	!> Transposition of input array (f_in) from y to x. For the input array, all
535	!> elements along y reside on the same PE, while after transposition, all
536	!> elements along x reside on the same PE.
537	!> This is a direct transposition for arrays with indices in regular order
538	!> (k,j,i) (cf. transpose_yx).
539	!------------------------------------------------------------------------------!
540	SUBROUTINE transpose_yxd( f_in, f_out )
541
542
543	USE cpulog, &
544	ONLY: cpu_log, log_point_s
545
546	USE indices, &
547	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
548
549	USE kinds
550
551	USE pegrid
552
553	USE transpose_indices, &
554	ONLY: nyn_x, nys_x, nzb_x, nzt_x
555
556	IMPLICIT NONE
557
558	INTEGER(iwp) :: i !<
559	INTEGER(iwp) :: j !<
560	INTEGER(iwp) :: k !<
561	INTEGER(iwp) :: l !<
562	INTEGER(iwp) :: m !<
563	INTEGER(iwp) :: xs !<
564
565	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
566	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
567	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
568	REAL(wp) :: work(nnxnnynnz) !<
569	#if defined( __parallel )
570
571	!
572	!-- Rearrange indices of input array in order to make data to be send
573	!-- by MPI contiguous
574	DO k = 1, nz
575	DO j = nys, nyn
576	DO i = nxl, nxr
577	f_inv(i,k,j) = f_in(k,j,i)
578	ENDDO
579	ENDDO
580	ENDDO
581
582	!
583	!-- Transpose array
584	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
585	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
586	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
587	work(1), sendrecvcount_xy, MPI_REAL, &
588	comm1dx, ierr )
589	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
590
591	!
592	!-- Reorder transposed array
593	m = 0
594	DO l = 0, pdims(1) - 1
595	xs = 0 + l * nnx
596	DO j = nys_x, nyn_x
597	DO k = 1, nz
598	DO i = xs, xs + nnx - 1
599	m = m + 1
600	f_out(i,j,k) = work(m)
601	ENDDO
602	ENDDO
603	ENDDO
604	ENDDO
605
606	#endif
607
608	END SUBROUTINE transpose_yxd
609
610
611	!------------------------------------------------------------------------------!
612	! Description:
613	! ------------
614	!> Resorting data for the transposition from y to z. The transposition itself
615	!> is carried out in transpose_yz
616	!------------------------------------------------------------------------------!
617	SUBROUTINE resort_for_yz( f_in, f_inv )
618
619
620	USE indices, &
621	ONLY: ny
622
623	USE kinds
624
625	USE transpose_indices, &
626	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
627
628	IMPLICIT NONE
629
630	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
631	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
632
633	INTEGER(iwp) :: i !<
634	INTEGER(iwp) :: j !<
635	INTEGER(iwp) :: k !<
636
637	!
638	!-- Rearrange indices of input array in order to make data to be send
639	!-- by MPI contiguous
640	!$OMP PARALLEL PRIVATE ( i, j, k )
641	!$OMP DO
642	#if __acc_fft_device
643	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
644	!$ACC PRESENT(f_inv, f_in)
645	#endif
646	DO k = nzb_y, nzt_y
647	DO i = nxl_y, nxr_y
648	DO j = 0, ny
649	f_inv(i,k,j) = f_in(j,i,k)
650	ENDDO
651	ENDDO
652	ENDDO
653	!$OMP END PARALLEL
654
655	END SUBROUTINE resort_for_yz
656
657
658	!------------------------------------------------------------------------------!
659	! Description:
660	! ------------
661	!> Transposition of input array (f_in) from y to z. For the input array, all
662	!> elements along y reside on the same PE, while after transposition, all
663	!> elements along z reside on the same PE.
664	!------------------------------------------------------------------------------!
665	SUBROUTINE transpose_yz( f_inv, f_out )
666
667
668	USE cpulog, &
669	ONLY: cpu_log, cpu_log_nowait, log_point_s
670
671	USE indices, &
672	ONLY: ny, nz
673
674	USE kinds
675
676	USE pegrid
677
678	USE transpose_indices, &
679	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
680
681	IMPLICIT NONE
682
683	INTEGER(iwp) :: i !<
684	INTEGER(iwp) :: j !<
685	INTEGER(iwp) :: k !<
686	INTEGER(iwp) :: l !<
687	INTEGER(iwp) :: zs !<
688
689	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
690	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
691
692	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
693	#if __acc_fft_device
694	!$ACC DECLARE CREATE(work)
695	#endif
696
697
698	!
699	!-- If the PE grid is one-dimensional along y, only local reordering
700	!-- of the data is necessary and no transposition has to be done.
701	IF ( pdims(1) == 1 ) THEN
702
703	!$OMP PARALLEL PRIVATE ( i, j, k )
704	!$OMP DO
705	#if __acc_fft_device
706	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
707	!$ACC PRESENT(f_out, f_inv)
708	#endif
709	DO j = 0, ny
710	DO k = nzb_y, nzt_y
711	DO i = nxl_y, nxr_y
712	f_out(i,j,k) = f_inv(i,k,j)
713	ENDDO
714	ENDDO
715	ENDDO
716	!$OMP END PARALLEL
717
718	ELSE
719
720	#if defined( __parallel )
721	!
722	!-- Transpose array
723	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
724
725	#if __acc_fft_device
726	#ifndef __cuda_aware_mpi
727	!$ACC UPDATE HOST(f_inv)
728	#else
729	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
730	#endif
731	#endif
732
733	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
734	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
735	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
736	comm1dx, ierr )
737
738	#if __acc_fft_device
739	#ifndef __cuda_aware_mpi
740	!$ACC UPDATE DEVICE(work)
741	#else
742	!$ACC END HOST_DATA
743	#endif
744	#endif
745
746	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
747
748	!
749	!-- Reorder transposed array
750	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
751	!$OMP DO
752	DO l = 0, pdims(1) - 1
753	zs = 1 + l * ( nzt_y - nzb_y + 1 )
754	#if __acc_fft_device
755	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
756	!$ACC PRESENT(f_out, work)
757	#endif
758	DO j = nys_z, nyn_z
759	DO k = zs, zs + nzt_y - nzb_y
760	DO i = nxl_z, nxr_z
761	f_out(i,j,k) = work(i,k-zs+1,j,l)
762	ENDDO
763	ENDDO
764	ENDDO
765	ENDDO
766	!$OMP END PARALLEL
767	#endif
768
769	ENDIF
770
771	END SUBROUTINE transpose_yz
772
773
774	!------------------------------------------------------------------------------!
775	! Description:
776	! ------------
777	!> Resorting data for the transposition from z to x. The transposition itself
778	!> is carried out in transpose_zx
779	!------------------------------------------------------------------------------!
780	SUBROUTINE resort_for_zx( f_in, f_inv )
781
782
783	USE indices, &
784	ONLY: nxl, nxr, nyn, nys, nz
785
786	USE kinds
787
788	IMPLICIT NONE
789
790	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
791	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
792
793	INTEGER(iwp) :: i !<
794	INTEGER(iwp) :: j !<
795	INTEGER(iwp) :: k !<
796
797	!
798	!-- Rearrange indices of input array in order to make data to be send
799	!-- by MPI contiguous
800	!$OMP PARALLEL PRIVATE ( i, j, k )
801	!$OMP DO
802	#if __acc_fft_device
803	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
804	!$ACC PRESENT(f_in, f_inv)
805	#endif
806	DO i = nxl, nxr
807	DO j = nys, nyn
808	DO k = 1,nz
809	f_inv(j,i,k) = f_in(k,j,i)
810	ENDDO
811	ENDDO
812	ENDDO
813	!$OMP END PARALLEL
814
815	END SUBROUTINE resort_for_zx
816
817
818	!------------------------------------------------------------------------------!
819	! Description:
820	! ------------
821	!> Transposition of input array (f_in) from z to x. For the input array, all
822	!> elements along z reside on the same PE, while after transposition, all
823	!> elements along x reside on the same PE.
824	!------------------------------------------------------------------------------!
825	SUBROUTINE transpose_zx( f_inv, f_out )
826
827
828	USE cpulog, &
829	ONLY: cpu_log, cpu_log_nowait, log_point_s
830
831	USE indices, &
832	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
833
834	USE kinds
835
836	USE pegrid
837
838	USE transpose_indices, &
839	ONLY: nyn_x, nys_x, nzb_x, nzt_x
840
841	IMPLICIT NONE
842
843	INTEGER(iwp) :: i !<
844	INTEGER(iwp) :: j !<
845	INTEGER(iwp) :: k !<
846	INTEGER(iwp) :: l !<
847	INTEGER(iwp) :: xs !<
848
849	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
850	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
851
852	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
853	#if __acc_fft_device
854	!$ACC DECLARE CREATE(work)
855	#endif
856
857
858	!
859	!-- If the PE grid is one-dimensional along y, only local reordering
860	!-- of the data is necessary and no transposition has to be done.
861	IF ( pdims(1) == 1 ) THEN
862
863	!$OMP PARALLEL PRIVATE ( i, j, k )
864	!$OMP DO
865	#if __acc_fft_device
866	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
867	!$ACC PRESENT(f_out, f_inv)
868	#endif
869	DO k = 1, nz
870	DO i = nxl, nxr
871	DO j = nys, nyn
872	f_out(i,j,k) = f_inv(j,i,k)
873	ENDDO
874	ENDDO
875	ENDDO
876	!$OMP END PARALLEL
877
878	ELSE
879
880	#if defined( __parallel )
881	!
882	!-- Transpose array
883	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
884
885	#if __acc_fft_device
886	#ifndef __cuda_aware_mpi
887	!$ACC UPDATE HOST(f_inv)
888	#else
889	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
890	#endif
891	#endif
892
893	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
894	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
895	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
896	comm1dx, ierr )
897
898	#if __acc_fft_device
899	#ifndef __cuda_aware_mpi
900	!$ACC UPDATE DEVICE(work)
901	#else
902	!$ACC END HOST_DATA
903	#endif
904	#endif
905
906	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
907
908	!
909	!-- Reorder transposed array
910	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
911	!$OMP DO
912	DO l = 0, pdims(1) - 1
913	xs = 0 + l * nnx
914	#if __acc_fft_device
915	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
916	!$ACC PRESENT(f_out, work)
917	#endif
918	DO k = nzb_x, nzt_x
919	DO i = xs, xs + nnx - 1
920	DO j = nys_x, nyn_x
921	f_out(i,j,k) = work(j,i-xs+1,k,l)
922	ENDDO
923	ENDDO
924	ENDDO
925	ENDDO
926	!$OMP END PARALLEL
927	#endif
928
929	ENDIF
930
931	END SUBROUTINE transpose_zx
932
933
934	!------------------------------------------------------------------------------!
935	! Description:
936	! ------------
937	!> Resorting data after the transposition from z to y. The transposition itself
938	!> is carried out in transpose_zy
939	!------------------------------------------------------------------------------!
940	SUBROUTINE resort_for_zy( f_inv, f_out )
941
942
943	USE indices, &
944	ONLY: ny
945
946	USE kinds
947
948	USE transpose_indices, &
949	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
950
951	IMPLICIT NONE
952
953	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
954	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
955
956
957	INTEGER(iwp) :: i !<
958	INTEGER(iwp) :: j !<
959	INTEGER(iwp) :: k !<
960
961	!
962	!-- Rearrange indices of input array in order to make data to be send
963	!-- by MPI contiguous
964	!$OMP PARALLEL PRIVATE ( i, j, k )
965	!$OMP DO
966	#if __acc_fft_device
967	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
968	!$ACC PRESENT(f_out, f_inv)
969	#endif
970	DO k = nzb_y, nzt_y
971	DO i = nxl_y, nxr_y
972	DO j = 0, ny
973	f_out(j,i,k) = f_inv(i,k,j)
974	ENDDO
975	ENDDO
976	ENDDO
977	!$OMP END PARALLEL
978
979	END SUBROUTINE resort_for_zy
980
981
982	!------------------------------------------------------------------------------!
983	! Description:cpu_log_nowait
984	! ------------
985	!> Transposition of input array (f_in) from z to y. For the input array, all
986	!> elements along z reside on the same PE, while after transposition, all
987	!> elements along y reside on the same PE.
988	!------------------------------------------------------------------------------!
989	SUBROUTINE transpose_zy( f_in, f_inv )
990
991
992	USE cpulog, &
993	ONLY: cpu_log, cpu_log_nowait, log_point_s
994
995	USE indices, &
996	ONLY: ny, nz
997
998	USE kinds
999
1000	USE pegrid
1001
1002	USE transpose_indices, &
1003	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1004
1005	IMPLICIT NONE
1006
1007	INTEGER(iwp) :: i !<
1008	INTEGER(iwp) :: j !<
1009	INTEGER(iwp) :: k !<
1010	INTEGER(iwp) :: l !<
1011	INTEGER(iwp) :: zs !<
1012
1013	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1014	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1015
1016	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1017	#if __acc_fft_device
1018	!$ACC DECLARE CREATE(work)
1019	#endif
1020
1021	!
1022	!-- If the PE grid is one-dimensional along y, the array has only to be
1023	!-- reordered locally and therefore no transposition has to be done.
1024	IF ( pdims(1) /= 1 ) THEN
1025
1026	#if defined( __parallel )
1027	!
1028	!-- Reorder input array for transposition
1029	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1030	!$OMP DO
1031	DO l = 0, pdims(1) - 1
1032	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1033	#if __acc_fft_device
1034	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1035	!$ACC PRESENT(work, f_in)
1036	#endif
1037	DO j = nys_z, nyn_z
1038	DO k = zs, zs + nzt_y - nzb_y
1039	DO i = nxl_z, nxr_z
1040	work(i,k-zs+1,j,l) = f_in(i,j,k)
1041	ENDDO
1042	ENDDO
1043	ENDDO
1044	ENDDO
1045	!$OMP END PARALLEL
1046
1047	!
1048	!-- Transpose array
1049	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1050
1051	#if __acc_fft_device
1052	#ifndef __cuda_aware_mpi
1053	!$ACC UPDATE HOST(work)
1054	#else
1055	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1056	#endif
1057	#endif
1058
1059	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1060	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1061	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1062	comm1dx, ierr )
1063
1064	#if __acc_fft_device
1065	#ifndef __cuda_aware_mpi
1066	!$ACC UPDATE DEVICE(f_inv)
1067	#else
1068	!$ACC END HOST_DATA
1069	#endif
1070	#endif
1071
1072	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1073	#endif
1074
1075	ELSE
1076	!
1077	!-- Reorder the array in the same way like ALLTOALL did it
1078	!$OMP PARALLEL PRIVATE ( i, j, k )
1079	!$OMP DO
1080	#if __acc_fft_device
1081	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1082	!$ACC PRESENT(f_inv, f_in)
1083	#endif
1084	DO k = nzb_y, nzt_y
1085	DO j = 0, ny
1086	DO i = nxl_y, nxr_y
1087	f_inv(i,k,j) = f_in(i,j,k)
1088	ENDDO
1089	ENDDO
1090	ENDDO
1091	!$OMP END PARALLEL
1092
1093	ENDIF
1094
1095	END SUBROUTINE transpose_zy
1096
1097
1098	!------------------------------------------------------------------------------!
1099	! Description:
1100	! ------------
1101	!> Transposition of input array (f_in) from z to y. For the input array, all
1102	!> elements along z reside on the same PE, while after transposition, all
1103	!> elements along y reside on the same PE.
1104	!> This is a direct transposition for arrays with indices in regular order
1105	!> (k,j,i) (cf. transpose_zy).
1106	!------------------------------------------------------------------------------!
1107	SUBROUTINE transpose_zyd( f_in, f_out )
1108
1109
1110	USE cpulog, &
1111	ONLY: cpu_log, log_point_s
1112
1113	USE indices, &
1114	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1115
1116	USE kinds
1117
1118	USE pegrid
1119
1120	USE transpose_indices, &
1121	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1122
1123	IMPLICIT NONE
1124
1125	INTEGER(iwp) :: i !<
1126	INTEGER(iwp) :: j !<
1127	INTEGER(iwp) :: k !<
1128	INTEGER(iwp) :: l !<
1129	INTEGER(iwp) :: m !<
1130	INTEGER(iwp) :: ys !<
1131
1132	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1133	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1134	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1135	REAL(wp) :: work(nnxnnynnz) !<
1136
1137	#if defined( __parallel )
1138
1139	!
1140	!-- Rearrange indices of input array in order to make data to be send
1141	!-- by MPI contiguous
1142	DO i = nxl, nxr
1143	DO j = nys, nyn
1144	DO k = 1, nz
1145	f_inv(j,i,k) = f_in(k,j,i)
1146	ENDDO
1147	ENDDO
1148	ENDDO
1149
1150	!
1151	!-- Move data to different array, because memory location of work1 is
1152	!-- needed further below (work1 = work2).
1153	!-- If the PE grid is one-dimensional along x, only local reordering
1154	!-- of the data is necessary and no transposition has to be done.
1155	IF ( pdims(2) == 1 ) THEN
1156	DO k = 1, nz
1157	DO i = nxl, nxr
1158	DO j = nys, nyn
1159	f_out(j,i,k) = f_inv(j,i,k)
1160	ENDDO
1161	ENDDO
1162	ENDDO
1163	RETURN
1164	ENDIF
1165
1166	!
1167	!-- Transpose array
1168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1169	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1170	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1171	work(1), sendrecvcount_zyd, MPI_REAL, &
1172	comm1dy, ierr )
1173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1174
1175	!
1176	!-- Reorder transposed array
1177	m = 0
1178	DO l = 0, pdims(2) - 1
1179	ys = 0 + l * nny
1180	DO k = nzb_yd, nzt_yd
1181	DO i = nxl_yd, nxr_yd
1182	DO j = ys, ys + nny - 1
1183	m = m + 1
1184	f_out(j,i,k) = work(m)
1185	ENDDO
1186	ENDDO
1187	ENDDO
1188	ENDDO
1189
1190	#endif
1191
1192	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |