Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4181

Last change on this file since 4181 was 4181, checked in by raasch, 5 years ago
bugfix: define directive added, which has been deleted by mistake in r4180
Property svn:keywords set to `Id`
File size: 32.2 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4181 2019-08-22 06:37:36Z raasch $
27	! bugfix: define directive added, which has been deleted by mistake in r4180
28	!
29	! 4180 2019-08-21 14:37:54Z scharf
30	! loop reordering for performance optimization
31	!
32	! 3832 2019-03-28 13:16:58Z raasch
33	! loop reordering for performance optimization
34	!
35	! 3694 2019-01-23 17:01:49Z knoop
36	! OpenACC port for SPEC
37	!
38	!------------------------------------------------------------------------------!
39	! Description:
40	! ------------
41	!> Resorting data for the transposition from x to y. The transposition itself
42	!> is carried out in transpose_xy
43	!------------------------------------------------------------------------------!
44
45	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
46
47	SUBROUTINE resort_for_xy( f_in, f_inv )
48
49
50	USE indices, &
51	ONLY: nx
52
53	USE kinds
54
55	USE transpose_indices, &
56	ONLY: nyn_x, nys_x, nzb_x, nzt_x
57
58	IMPLICIT NONE
59
60	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
61	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
62
63
64	INTEGER(iwp) :: i !<
65	INTEGER(iwp) :: j !<
66	INTEGER(iwp) :: k !<
67	!
68	!-- Rearrange indices of input array in order to make data to be send
69	!-- by MPI contiguous
70	!$OMP PARALLEL PRIVATE ( i, j, k )
71	!$OMP DO
72	#if __acc_fft_device
73	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
74	!$ACC PRESENT(f_inv, f_in)
75	#endif
76	DO k = nzb_x, nzt_x
77	DO j = nys_x, nyn_x
78	DO i = 0, nx
79	f_inv(j,k,i) = f_in(i,j,k)
80	ENDDO
81	ENDDO
82	ENDDO
83	!$OMP END PARALLEL
84
85	END SUBROUTINE resort_for_xy
86
87
88	!------------------------------------------------------------------------------!
89	! Description:
90	! ------------
91	!> Transposition of input array (f_in) from x to y. For the input array, all
92	!> elements along x reside on the same PE, while after transposition, all
93	!> elements along y reside on the same PE.
94	!------------------------------------------------------------------------------!
95	SUBROUTINE transpose_xy( f_inv, f_out )
96
97
98	USE cpulog, &
99	ONLY: cpu_log, cpu_log_nowait, log_point_s
100
101	USE indices, &
102	ONLY: nx, ny
103
104	USE kinds
105
106	USE pegrid
107
108	USE transpose_indices, &
109	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
110
111	IMPLICIT NONE
112
113	INTEGER(iwp) :: i !<
114	INTEGER(iwp) :: j !<
115	INTEGER(iwp) :: k !<
116	INTEGER(iwp) :: l !<
117	INTEGER(iwp) :: ys !<
118
119	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
120	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
121
122	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
123	#if __acc_fft_device
124	!$ACC DECLARE CREATE(work)
125	#endif
126
127
128	IF ( numprocs /= 1 ) THEN
129
130	#if defined( __parallel )
131	!
132	!-- Transpose array
133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
134
135	#if __acc_fft_device
136	#ifndef __cuda_aware_mpi
137	!$ACC UPDATE HOST(f_inv)
138	#else
139	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
140	#endif
141	#endif
142
143	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
144	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
145	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
146	comm1dy, ierr )
147
148	#if __acc_fft_device
149	#ifndef __cuda_aware_mpi
150	!$ACC UPDATE DEVICE(work)
151	#else
152	!$ACC END HOST_DATA
153	#endif
154	#endif
155
156	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
157
158	!
159	!-- Reorder transposed array
160	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
161	!$OMP DO
162	DO l = 0, pdims(2) - 1
163	ys = 0 + l * ( nyn_x - nys_x + 1 )
164	#if __acc_fft_device
165	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
166	!$ACC PRESENT(f_out, work)
167	#endif
168	DO i = nxl_y, nxr_y
169	DO k = nzb_y, nzt_y
170	DO j = ys, ys + nyn_x - nys_x
171	f_out(j,i,k) = work(j-ys+1,k,i,l)
172	ENDDO
173	ENDDO
174	ENDDO
175	ENDDO
176	!$OMP END PARALLEL
177	#endif
178
179	ELSE
180
181	!
182	!-- Reorder transposed array
183	!$OMP PARALLEL PRIVATE ( i, j, k )
184	!$OMP DO
185	#if __acc_fft_device
186	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
187	!$ACC PRESENT(f_out, f_inv)
188	#endif
189	DO k = nzb_y, nzt_y
190	DO i = nxl_y, nxr_y
191	DO j = 0, ny
192	f_out(j,i,k) = f_inv(j,k,i)
193	ENDDO
194	ENDDO
195	ENDDO
196	!$OMP END PARALLEL
197
198	ENDIF
199
200	END SUBROUTINE transpose_xy
201
202
203	!------------------------------------------------------------------------------!
204	! Description:
205	! ------------
206	!> Resorting data after the transposition from x to z. The transposition itself
207	!> is carried out in transpose_xz
208	!------------------------------------------------------------------------------!
209	SUBROUTINE resort_for_xz( f_inv, f_out )
210
211
212	USE indices, &
213	ONLY: nxl, nxr, nyn, nys, nz
214
215	USE kinds
216
217	IMPLICIT NONE
218
219	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
220	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
221
222	INTEGER(iwp) :: i !<
223	INTEGER(iwp) :: j !<
224	INTEGER(iwp) :: k !<
225	!
226	!-- Rearrange indices of input array in order to make data to be send
227	!-- by MPI contiguous.
228	!-- In case of parallel fft/transposition, scattered store is faster in
229	!-- backward direction!!!
230	!$OMP PARALLEL PRIVATE ( i, j, k )
231	!$OMP DO
232	#if __acc_fft_device
233	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
234	!$ACC PRESENT(f_out, f_inv)
235	#endif
236	DO i = nxl, nxr
237	DO j = nys, nyn
238	DO k = 1, nz
239	f_out(k,j,i) = f_inv(j,i,k)
240	ENDDO
241	ENDDO
242	ENDDO
243	!$OMP END PARALLEL
244
245	END SUBROUTINE resort_for_xz
246
247
248	!------------------------------------------------------------------------------!
249	! Description:
250	! ------------
251	!> Transposition of input array (f_in) from x to z. For the input array, all
252	!> elements along x reside on the same PE, while after transposition, all
253	!> elements along z reside on the same PE.
254	!------------------------------------------------------------------------------!
255	SUBROUTINE transpose_xz( f_in, f_inv )
256
257
258	USE cpulog, &
259	ONLY: cpu_log, cpu_log_nowait, log_point_s
260
261	USE indices, &
262	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
263
264	USE kinds
265
266	USE pegrid
267
268	USE transpose_indices, &
269	ONLY: nyn_x, nys_x, nzb_x, nzt_x
270
271	IMPLICIT NONE
272
273	INTEGER(iwp) :: i !<
274	INTEGER(iwp) :: j !<
275	INTEGER(iwp) :: k !<
276	INTEGER(iwp) :: l !<
277	INTEGER(iwp) :: xs !<
278
279	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
280	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
281
282	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
283	#if __acc_fft_device
284	!$ACC DECLARE CREATE(work)
285	#endif
286
287
288	!
289	!-- If the PE grid is one-dimensional along y, the array has only to be
290	!-- reordered locally and therefore no transposition has to be done.
291	IF ( pdims(1) /= 1 ) THEN
292
293	#if defined( __parallel )
294	!
295	!-- Reorder input array for transposition
296	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
297	!$OMP DO
298	DO l = 0, pdims(1) - 1
299	xs = 0 + l * nnx
300	#if __acc_fft_device
301	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
302	!$ACC PRESENT(work, f_in)
303	#endif
304	DO k = nzb_x, nzt_x
305	DO i = xs, xs + nnx - 1
306	DO j = nys_x, nyn_x
307	work(j,i-xs+1,k,l) = f_in(i,j,k)
308	ENDDO
309	ENDDO
310	ENDDO
311	ENDDO
312	!$OMP END PARALLEL
313
314	!
315	!-- Transpose array
316	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
317
318	#if __acc_fft_device
319	#ifndef __cuda_aware_mpi
320	!$ACC UPDATE HOST(work)
321	#else
322	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
323	#endif
324	#endif
325
326	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
327	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
328	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
329	comm1dx, ierr )
330
331	#if __acc_fft_device
332	#ifndef __cuda_aware_mpi
333	!$ACC UPDATE DEVICE(f_inv)
334	#else
335	!$ACC END HOST_DATA
336	#endif
337	#endif
338
339	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
340	#endif
341
342	ELSE
343
344	!
345	!-- Reorder the array in a way that the z index is in first position
346	!$OMP PARALLEL PRIVATE ( i, j, k )
347	!$OMP DO
348	#if __acc_fft_device
349	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
350	!$ACC PRESENT(f_inv, f_in)
351	#endif
352	DO i = nxl, nxr
353	DO j = nys, nyn
354	DO k = 1, nz
355	f_inv(j,i,k) = f_in(i,j,k)
356	ENDDO
357	ENDDO
358	ENDDO
359	!$OMP END PARALLEL
360
361	ENDIF
362
363	END SUBROUTINE transpose_xz
364
365
366	!------------------------------------------------------------------------------!
367	! Description:
368	! ------------
369	!> Resorting data after the transposition from y to x. The transposition itself
370	!> is carried out in transpose_yx
371	!------------------------------------------------------------------------------!
372	SUBROUTINE resort_for_yx( f_inv, f_out )
373
374
375	USE indices, &
376	ONLY: nx
377
378	USE kinds
379
380	USE transpose_indices, &
381	ONLY: nyn_x, nys_x, nzb_x, nzt_x
382
383	IMPLICIT NONE
384
385	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
386	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
387
388
389	INTEGER(iwp) :: i !<
390	INTEGER(iwp) :: j !<
391	INTEGER(iwp) :: k !<
392	!
393	!-- Rearrange indices of input array in order to make data to be send
394	!-- by MPI contiguous
395	!$OMP PARALLEL PRIVATE ( i, j, k )
396	!$OMP DO
397	#if __acc_fft_device
398	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
399	!$ACC PRESENT(f_out, f_inv)
400	#endif
401	DO k = nzb_x, nzt_x
402	DO j = nys_x, nyn_x
403	DO i = 0, nx
404	f_out(i,j,k) = f_inv(j,k,i)
405	ENDDO
406	ENDDO
407	ENDDO
408	!$OMP END PARALLEL
409
410	END SUBROUTINE resort_for_yx
411
412
413	!------------------------------------------------------------------------------!
414	! Description:
415	! ------------
416	!> Transposition of input array (f_in) from y to x. For the input array, all
417	!> elements along y reside on the same PE, while after transposition, all
418	!> elements along x reside on the same PE.
419	!------------------------------------------------------------------------------!
420	SUBROUTINE transpose_yx( f_in, f_inv )
421
422
423	USE cpulog, &
424	ONLY: cpu_log, cpu_log_nowait, log_point_s
425
426	USE indices, &
427	ONLY: nx, ny
428
429	USE kinds
430
431	USE pegrid
432
433	USE transpose_indices, &
434	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
435
436	IMPLICIT NONE
437
438	INTEGER(iwp) :: i !<
439	INTEGER(iwp) :: j !<
440	INTEGER(iwp) :: k !<
441	INTEGER(iwp) :: l !<
442	INTEGER(iwp) :: ys !<
443
444	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
445	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
446
447	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
448	#if __acc_fft_device
449	!$ACC DECLARE CREATE(work)
450	#endif
451
452
453	IF ( numprocs /= 1 ) THEN
454
455	#if defined( __parallel )
456	!
457	!-- Reorder input array for transposition
458	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
459	!$OMP DO
460	DO l = 0, pdims(2) - 1
461	ys = 0 + l * ( nyn_x - nys_x + 1 )
462	#if __acc_fft_device
463	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
464	!$ACC PRESENT(work, f_in)
465	#endif
466	DO i = nxl_y, nxr_y
467	DO k = nzb_y, nzt_y
468	DO j = ys, ys + nyn_x - nys_x
469	work(j-ys+1,k,i,l) = f_in(j,i,k)
470	ENDDO
471	ENDDO
472	ENDDO
473	ENDDO
474	!$OMP END PARALLEL
475
476	!
477	!-- Transpose array
478	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
479
480	#if __acc_fft_device
481	#ifndef __cuda_aware_mpi
482	!$ACC UPDATE HOST(work)
483	#else
484	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
485	#endif
486	#endif
487
488	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
489	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
490	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
491	comm1dy, ierr )
492
493	#if __acc_fft_device
494	#ifndef __cuda_aware_mpi
495	!$ACC UPDATE DEVICE(f_inv)
496	#else
497	!$ACC END HOST_DATA
498	#endif
499	#endif
500
501	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
502	#endif
503
504	ELSE
505
506	!
507	!-- Reorder array f_in the same way as ALLTOALL did it
508	!$OMP PARALLEL PRIVATE ( i, j, k )
509	!$OMP DO
510	#if __acc_fft_device
511	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
512	!$ACC PRESENT(f_inv, f_in)
513	#endif
514	DO i = nxl_y, nxr_y
515	DO k = nzb_y, nzt_y
516	DO j = 0, ny
517	f_inv(j,k,i) = f_in(j,i,k)
518	ENDDO
519	ENDDO
520	ENDDO
521	!$OMP END PARALLEL
522
523	ENDIF
524
525	END SUBROUTINE transpose_yx
526
527
528	!------------------------------------------------------------------------------!
529	! Description:
530	! ------------
531	!> Transposition of input array (f_in) from y to x. For the input array, all
532	!> elements along y reside on the same PE, while after transposition, all
533	!> elements along x reside on the same PE.
534	!> This is a direct transposition for arrays with indices in regular order
535	!> (k,j,i) (cf. transpose_yx).
536	!------------------------------------------------------------------------------!
537	SUBROUTINE transpose_yxd( f_in, f_out )
538
539
540	USE cpulog, &
541	ONLY: cpu_log, log_point_s
542
543	USE indices, &
544	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
545
546	USE kinds
547
548	USE pegrid
549
550	USE transpose_indices, &
551	ONLY: nyn_x, nys_x, nzb_x, nzt_x
552
553	IMPLICIT NONE
554
555	INTEGER(iwp) :: i !<
556	INTEGER(iwp) :: j !<
557	INTEGER(iwp) :: k !<
558	INTEGER(iwp) :: l !<
559	INTEGER(iwp) :: m !<
560	INTEGER(iwp) :: xs !<
561
562	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
563	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
564	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
565	REAL(wp) :: work(nnxnnynnz) !<
566	#if defined( __parallel )
567
568	!
569	!-- Rearrange indices of input array in order to make data to be send
570	!-- by MPI contiguous
571	DO k = 1, nz
572	DO j = nys, nyn
573	DO i = nxl, nxr
574	f_inv(i,k,j) = f_in(k,j,i)
575	ENDDO
576	ENDDO
577	ENDDO
578
579	!
580	!-- Transpose array
581	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
582	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
583	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
584	work(1), sendrecvcount_xy, MPI_REAL, &
585	comm1dx, ierr )
586	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
587
588	!
589	!-- Reorder transposed array
590	m = 0
591	DO l = 0, pdims(1) - 1
592	xs = 0 + l * nnx
593	DO j = nys_x, nyn_x
594	DO k = 1, nz
595	DO i = xs, xs + nnx - 1
596	m = m + 1
597	f_out(i,j,k) = work(m)
598	ENDDO
599	ENDDO
600	ENDDO
601	ENDDO
602
603	#endif
604
605	END SUBROUTINE transpose_yxd
606
607
608	!------------------------------------------------------------------------------!
609	! Description:
610	! ------------
611	!> Resorting data for the transposition from y to z. The transposition itself
612	!> is carried out in transpose_yz
613	!------------------------------------------------------------------------------!
614	SUBROUTINE resort_for_yz( f_in, f_inv )
615
616
617	USE indices, &
618	ONLY: ny
619
620	USE kinds
621
622	USE transpose_indices, &
623	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
624
625	IMPLICIT NONE
626
627	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
628	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
629
630	INTEGER(iwp) :: i !<
631	INTEGER(iwp) :: j !<
632	INTEGER(iwp) :: k !<
633
634	!
635	!-- Rearrange indices of input array in order to make data to be send
636	!-- by MPI contiguous
637	!$OMP PARALLEL PRIVATE ( i, j, k )
638	!$OMP DO
639	#if __acc_fft_device
640	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
641	!$ACC PRESENT(f_inv, f_in)
642	#endif
643	DO k = nzb_y, nzt_y
644	DO i = nxl_y, nxr_y
645	DO j = 0, ny
646	f_inv(i,k,j) = f_in(j,i,k)
647	ENDDO
648	ENDDO
649	ENDDO
650	!$OMP END PARALLEL
651
652	END SUBROUTINE resort_for_yz
653
654
655	!------------------------------------------------------------------------------!
656	! Description:
657	! ------------
658	!> Transposition of input array (f_in) from y to z. For the input array, all
659	!> elements along y reside on the same PE, while after transposition, all
660	!> elements along z reside on the same PE.
661	!------------------------------------------------------------------------------!
662	SUBROUTINE transpose_yz( f_inv, f_out )
663
664
665	USE cpulog, &
666	ONLY: cpu_log, cpu_log_nowait, log_point_s
667
668	USE indices, &
669	ONLY: ny, nz
670
671	USE kinds
672
673	USE pegrid
674
675	USE transpose_indices, &
676	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
677
678	IMPLICIT NONE
679
680	INTEGER(iwp) :: i !<
681	INTEGER(iwp) :: j !<
682	INTEGER(iwp) :: k !<
683	INTEGER(iwp) :: l !<
684	INTEGER(iwp) :: zs !<
685
686	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
687	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
688
689	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
690	#if __acc_fft_device
691	!$ACC DECLARE CREATE(work)
692	#endif
693
694
695	!
696	!-- If the PE grid is one-dimensional along y, only local reordering
697	!-- of the data is necessary and no transposition has to be done.
698	IF ( pdims(1) == 1 ) THEN
699
700	!$OMP PARALLEL PRIVATE ( i, j, k )
701	!$OMP DO
702	#if __acc_fft_device
703	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
704	!$ACC PRESENT(f_out, f_inv)
705	#endif
706	DO j = 0, ny
707	DO k = nzb_y, nzt_y
708	DO i = nxl_y, nxr_y
709	f_out(i,j,k) = f_inv(i,k,j)
710	ENDDO
711	ENDDO
712	ENDDO
713	!$OMP END PARALLEL
714
715	ELSE
716
717	#if defined( __parallel )
718	!
719	!-- Transpose array
720	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
721
722	#if __acc_fft_device
723	#ifndef __cuda_aware_mpi
724	!$ACC UPDATE HOST(f_inv)
725	#else
726	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
727	#endif
728	#endif
729
730	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
731	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
732	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
733	comm1dx, ierr )
734
735	#if __acc_fft_device
736	#ifndef __cuda_aware_mpi
737	!$ACC UPDATE DEVICE(work)
738	#else
739	!$ACC END HOST_DATA
740	#endif
741	#endif
742
743	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
744
745	!
746	!-- Reorder transposed array
747	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
748	!$OMP DO
749	DO l = 0, pdims(1) - 1
750	zs = 1 + l * ( nzt_y - nzb_y + 1 )
751	#if __acc_fft_device
752	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
753	!$ACC PRESENT(f_out, work)
754	#endif
755	DO j = nys_z, nyn_z
756	DO k = zs, zs + nzt_y - nzb_y
757	DO i = nxl_z, nxr_z
758	f_out(i,j,k) = work(i,k-zs+1,j,l)
759	ENDDO
760	ENDDO
761	ENDDO
762	ENDDO
763	!$OMP END PARALLEL
764	#endif
765
766	ENDIF
767
768	END SUBROUTINE transpose_yz
769
770
771	!------------------------------------------------------------------------------!
772	! Description:
773	! ------------
774	!> Resorting data for the transposition from z to x. The transposition itself
775	!> is carried out in transpose_zx
776	!------------------------------------------------------------------------------!
777	SUBROUTINE resort_for_zx( f_in, f_inv )
778
779
780	USE indices, &
781	ONLY: nxl, nxr, nyn, nys, nz
782
783	USE kinds
784
785	IMPLICIT NONE
786
787	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
788	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
789
790	INTEGER(iwp) :: i !<
791	INTEGER(iwp) :: j !<
792	INTEGER(iwp) :: k !<
793
794	!
795	!-- Rearrange indices of input array in order to make data to be send
796	!-- by MPI contiguous
797	!$OMP PARALLEL PRIVATE ( i, j, k )
798	!$OMP DO
799	#if __acc_fft_device
800	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
801	!$ACC PRESENT(f_in, f_inv)
802	#endif
803	DO i = nxl, nxr
804	DO j = nys, nyn
805	DO k = 1,nz
806	f_inv(j,i,k) = f_in(k,j,i)
807	ENDDO
808	ENDDO
809	ENDDO
810	!$OMP END PARALLEL
811
812	END SUBROUTINE resort_for_zx
813
814
815	!------------------------------------------------------------------------------!
816	! Description:
817	! ------------
818	!> Transposition of input array (f_in) from z to x. For the input array, all
819	!> elements along z reside on the same PE, while after transposition, all
820	!> elements along x reside on the same PE.
821	!------------------------------------------------------------------------------!
822	SUBROUTINE transpose_zx( f_inv, f_out )
823
824
825	USE cpulog, &
826	ONLY: cpu_log, cpu_log_nowait, log_point_s
827
828	USE indices, &
829	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
830
831	USE kinds
832
833	USE pegrid
834
835	USE transpose_indices, &
836	ONLY: nyn_x, nys_x, nzb_x, nzt_x
837
838	IMPLICIT NONE
839
840	INTEGER(iwp) :: i !<
841	INTEGER(iwp) :: j !<
842	INTEGER(iwp) :: k !<
843	INTEGER(iwp) :: l !<
844	INTEGER(iwp) :: xs !<
845
846	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
847	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
848
849	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
850	#if __acc_fft_device
851	!$ACC DECLARE CREATE(work)
852	#endif
853
854
855	!
856	!-- If the PE grid is one-dimensional along y, only local reordering
857	!-- of the data is necessary and no transposition has to be done.
858	IF ( pdims(1) == 1 ) THEN
859
860	!$OMP PARALLEL PRIVATE ( i, j, k )
861	!$OMP DO
862	#if __acc_fft_device
863	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
864	!$ACC PRESENT(f_out, f_inv)
865	#endif
866	DO k = 1, nz
867	DO i = nxl, nxr
868	DO j = nys, nyn
869	f_out(i,j,k) = f_inv(j,i,k)
870	ENDDO
871	ENDDO
872	ENDDO
873	!$OMP END PARALLEL
874
875	ELSE
876
877	#if defined( __parallel )
878	!
879	!-- Transpose array
880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
881
882	#if __acc_fft_device
883	#ifndef __cuda_aware_mpi
884	!$ACC UPDATE HOST(f_inv)
885	#else
886	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
887	#endif
888	#endif
889
890	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
891	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
892	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
893	comm1dx, ierr )
894
895	#if __acc_fft_device
896	#ifndef __cuda_aware_mpi
897	!$ACC UPDATE DEVICE(work)
898	#else
899	!$ACC END HOST_DATA
900	#endif
901	#endif
902
903	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
904
905	!
906	!-- Reorder transposed array
907	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
908	!$OMP DO
909	DO l = 0, pdims(1) - 1
910	xs = 0 + l * nnx
911	#if __acc_fft_device
912	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
913	!$ACC PRESENT(f_out, work)
914	#endif
915	DO k = nzb_x, nzt_x
916	DO i = xs, xs + nnx - 1
917	DO j = nys_x, nyn_x
918	f_out(i,j,k) = work(j,i-xs+1,k,l)
919	ENDDO
920	ENDDO
921	ENDDO
922	ENDDO
923	!$OMP END PARALLEL
924	#endif
925
926	ENDIF
927
928	END SUBROUTINE transpose_zx
929
930
931	!------------------------------------------------------------------------------!
932	! Description:
933	! ------------
934	!> Resorting data after the transposition from z to y. The transposition itself
935	!> is carried out in transpose_zy
936	!------------------------------------------------------------------------------!
937	SUBROUTINE resort_for_zy( f_inv, f_out )
938
939
940	USE indices, &
941	ONLY: ny
942
943	USE kinds
944
945	USE transpose_indices, &
946	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
947
948	IMPLICIT NONE
949
950	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
951	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
952
953
954	INTEGER(iwp) :: i !<
955	INTEGER(iwp) :: j !<
956	INTEGER(iwp) :: k !<
957
958	!
959	!-- Rearrange indices of input array in order to make data to be send
960	!-- by MPI contiguous
961	!$OMP PARALLEL PRIVATE ( i, j, k )
962	!$OMP DO
963	#if __acc_fft_device
964	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
965	!$ACC PRESENT(f_out, f_inv)
966	#endif
967	DO k = nzb_y, nzt_y
968	DO i = nxl_y, nxr_y
969	DO j = 0, ny
970	f_out(j,i,k) = f_inv(i,k,j)
971	ENDDO
972	ENDDO
973	ENDDO
974	!$OMP END PARALLEL
975
976	END SUBROUTINE resort_for_zy
977
978
979	!------------------------------------------------------------------------------!
980	! Description:cpu_log_nowait
981	! ------------
982	!> Transposition of input array (f_in) from z to y. For the input array, all
983	!> elements along z reside on the same PE, while after transposition, all
984	!> elements along y reside on the same PE.
985	!------------------------------------------------------------------------------!
986	SUBROUTINE transpose_zy( f_in, f_inv )
987
988
989	USE cpulog, &
990	ONLY: cpu_log, cpu_log_nowait, log_point_s
991
992	USE indices, &
993	ONLY: ny, nz
994
995	USE kinds
996
997	USE pegrid
998
999	USE transpose_indices, &
1000	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1001
1002	IMPLICIT NONE
1003
1004	INTEGER(iwp) :: i !<
1005	INTEGER(iwp) :: j !<
1006	INTEGER(iwp) :: k !<
1007	INTEGER(iwp) :: l !<
1008	INTEGER(iwp) :: zs !<
1009
1010	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1011	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1012
1013	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1014	#if __acc_fft_device
1015	!$ACC DECLARE CREATE(work)
1016	#endif
1017
1018	!
1019	!-- If the PE grid is one-dimensional along y, the array has only to be
1020	!-- reordered locally and therefore no transposition has to be done.
1021	IF ( pdims(1) /= 1 ) THEN
1022
1023	#if defined( __parallel )
1024	!
1025	!-- Reorder input array for transposition
1026	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1027	!$OMP DO
1028	DO l = 0, pdims(1) - 1
1029	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1030	#if __acc_fft_device
1031	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1032	!$ACC PRESENT(work, f_in)
1033	#endif
1034	DO j = nys_z, nyn_z
1035	DO k = zs, zs + nzt_y - nzb_y
1036	DO i = nxl_z, nxr_z
1037	work(i,k-zs+1,j,l) = f_in(i,j,k)
1038	ENDDO
1039	ENDDO
1040	ENDDO
1041	ENDDO
1042	!$OMP END PARALLEL
1043
1044	!
1045	!-- Transpose array
1046	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1047
1048	#if __acc_fft_device
1049	#ifndef __cuda_aware_mpi
1050	!$ACC UPDATE HOST(work)
1051	#else
1052	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1053	#endif
1054	#endif
1055
1056	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1057	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1058	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1059	comm1dx, ierr )
1060
1061	#if __acc_fft_device
1062	#ifndef __cuda_aware_mpi
1063	!$ACC UPDATE DEVICE(f_inv)
1064	#else
1065	!$ACC END HOST_DATA
1066	#endif
1067	#endif
1068
1069	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1070	#endif
1071
1072	ELSE
1073	!
1074	!-- Reorder the array in the same way like ALLTOALL did it
1075	!$OMP PARALLEL PRIVATE ( i, j, k )
1076	!$OMP DO
1077	#if __acc_fft_device
1078	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1079	!$ACC PRESENT(f_inv, f_in)
1080	#endif
1081	DO k = nzb_y, nzt_y
1082	DO j = 0, ny
1083	DO i = nxl_y, nxr_y
1084	f_inv(i,k,j) = f_in(i,j,k)
1085	ENDDO
1086	ENDDO
1087	ENDDO
1088	!$OMP END PARALLEL
1089
1090	ENDIF
1091
1092	END SUBROUTINE transpose_zy
1093
1094
1095	!------------------------------------------------------------------------------!
1096	! Description:
1097	! ------------
1098	!> Transposition of input array (f_in) from z to y. For the input array, all
1099	!> elements along z reside on the same PE, while after transposition, all
1100	!> elements along y reside on the same PE.
1101	!> This is a direct transposition for arrays with indices in regular order
1102	!> (k,j,i) (cf. transpose_zy).
1103	!------------------------------------------------------------------------------!
1104	SUBROUTINE transpose_zyd( f_in, f_out )
1105
1106
1107	USE cpulog, &
1108	ONLY: cpu_log, log_point_s
1109
1110	USE indices, &
1111	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1112
1113	USE kinds
1114
1115	USE pegrid
1116
1117	USE transpose_indices, &
1118	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1119
1120	IMPLICIT NONE
1121
1122	INTEGER(iwp) :: i !<
1123	INTEGER(iwp) :: j !<
1124	INTEGER(iwp) :: k !<
1125	INTEGER(iwp) :: l !<
1126	INTEGER(iwp) :: m !<
1127	INTEGER(iwp) :: ys !<
1128
1129	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1130	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1131	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1132	REAL(wp) :: work(nnxnnynnz) !<
1133
1134	#if defined( __parallel )
1135
1136	!
1137	!-- Rearrange indices of input array in order to make data to be send
1138	!-- by MPI contiguous
1139	DO i = nxl, nxr
1140	DO j = nys, nyn
1141	DO k = 1, nz
1142	f_inv(j,i,k) = f_in(k,j,i)
1143	ENDDO
1144	ENDDO
1145	ENDDO
1146
1147	!
1148	!-- Move data to different array, because memory location of work1 is
1149	!-- needed further below (work1 = work2).
1150	!-- If the PE grid is one-dimensional along x, only local reordering
1151	!-- of the data is necessary and no transposition has to be done.
1152	IF ( pdims(2) == 1 ) THEN
1153	DO k = 1, nz
1154	DO i = nxl, nxr
1155	DO j = nys, nyn
1156	f_out(j,i,k) = f_inv(j,i,k)
1157	ENDDO
1158	ENDDO
1159	ENDDO
1160	RETURN
1161	ENDIF
1162
1163	!
1164	!-- Transpose array
1165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1167	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1168	work(1), sendrecvcount_zyd, MPI_REAL, &
1169	comm1dy, ierr )
1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1171
1172	!
1173	!-- Reorder transposed array
1174	m = 0
1175	DO l = 0, pdims(2) - 1
1176	ys = 0 + l * nny
1177	DO k = nzb_yd, nzt_yd
1178	DO i = nxl_yd, nxr_yd
1179	DO j = ys, ys + nny - 1
1180	m = m + 1
1181	f_out(j,i,k) = work(m)
1182	ENDDO
1183	ENDDO
1184	ENDDO
1185	ENDDO
1186
1187	#endif
1188
1189	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |