Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3701

Last change on this file since 3701 was 3694, checked in by knoop, 6 years ago
Bugfix: moved OpenACC specific cpp #endif , so cpu_log call is not affected.
Property svn:keywords set to `Id`
File size: 33.9 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 3694 2019-01-23 17:01:49Z knoop $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed
31	!
32	! 2718 2018-01-02 08:49:38Z maronga
33	! Corrected "Former revisions" section
34	!
35	! 2696 2017-12-14 17:12:51Z kanani
36	! Change in file header (GPL part)
37	!
38	! 2119 2017-01-17 16:51:50Z raasch
39	!
40	! 2118 2017-01-17 16:38:49Z raasch
41	! OpenACC directives removed
42	!
43	! 2000 2016-08-20 18:09:15Z knoop
44	! Forced header and separation lines into 80 columns
45	!
46	! 1682 2015-10-07 23:56:08Z knoop
47	! Code annotations made doxygen readable
48	!
49	! 1324 2014-03-21 09:13:16Z suehring
50	! Bugfix: ONLY statement for module pegrid removed
51	!
52	! 1320 2014-03-20 08:40:49Z raasch
53	! ONLY-attribute added to USE-statements,
54	! kind-parameters added to all INTEGER and REAL declaration statements,
55	! kinds are defined in new module kinds,
56	! old module precision_kind is removed,
57	! revision history before 2012 removed,
58	! comment fields (!:) to be used for variable explanations added to
59	! all variable declaration statements
60	!
61	! 1318 2014-03-17 13:35:16Z raasch
62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
63	! required for solving the Poisson equation (poisfft),
64	! module interfaces removed
65	!
66	! 1257 2013-11-08 15:18:40Z raasch
67	! openacc loop and loop vector clauses removed
68	!
69	! 1216 2013-08-26 09:31:42Z raasch
70	! re-sorting of the transposed / to be transposed arrays moved to separate
71	! routines resort_for_...
72	!
73	! 1111 2013-03-08 23:54:10Z raasch
74	! openACC directives added,
75	! resorting data from/to work changed, work got 4 dimensions instead of 1
76	!
77	! 1106 2013-03-04 05:31:38Z raasch
78	! preprocessor lines rearranged so that routines can also be used in serial
79	! (non-parallel) mode
80	!
81	! 1092 2013-02-02 11:24:22Z raasch
82	! unused variables removed
83	!
84	! 1036 2012-10-22 13:43:42Z raasch
85	! code put under GPL (PALM 3.9)
86	!
87	! 1003 2012-09-14 14:35:53Z raasch
88	! indices nxa, nya, etc. replaced by nx, ny, etc.
89	!
90	! Revision 1.1 1997/07/24 11:25:18 raasch
91	! Initial revision
92	!
93
94	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
95
96	!------------------------------------------------------------------------------!
97	! Description:
98	! ------------
99	!> Resorting data for the transposition from x to y. The transposition itself
100	!> is carried out in transpose_xy
101	!------------------------------------------------------------------------------!
102	SUBROUTINE resort_for_xy( f_in, f_inv )
103
104
105	USE indices, &
106	ONLY: nx
107
108	USE kinds
109
110	USE transpose_indices, &
111	ONLY: nyn_x, nys_x, nzb_x, nzt_x
112
113	IMPLICIT NONE
114
115	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
116	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
117
118
119	INTEGER(iwp) :: i !<
120	INTEGER(iwp) :: j !<
121	INTEGER(iwp) :: k !<
122	!
123	!-- Rearrange indices of input array in order to make data to be send
124	!-- by MPI contiguous
125	!$OMP PARALLEL PRIVATE ( i, j, k )
126	!$OMP DO
127	#if __acc_fft_device
128	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
129	!$ACC PRESENT(f_inv, f_in)
130	#endif
131	DO i = 0, nx
132	DO k = nzb_x, nzt_x
133	DO j = nys_x, nyn_x
134	f_inv(j,k,i) = f_in(i,j,k)
135	ENDDO
136	ENDDO
137	ENDDO
138	!$OMP END PARALLEL
139
140	END SUBROUTINE resort_for_xy
141
142
143	!------------------------------------------------------------------------------!
144	! Description:
145	! ------------
146	!> Transposition of input array (f_in) from x to y. For the input array, all
147	!> elements along x reside on the same PE, while after transposition, all
148	!> elements along y reside on the same PE.
149	!------------------------------------------------------------------------------!
150	SUBROUTINE transpose_xy( f_inv, f_out )
151
152
153	USE cpulog, &
154	ONLY: cpu_log, cpu_log_nowait, log_point_s
155
156	USE indices, &
157	ONLY: nx, ny
158
159	USE kinds
160
161	USE pegrid
162
163	USE transpose_indices, &
164	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
165
166	IMPLICIT NONE
167
168	INTEGER(iwp) :: i !<
169	INTEGER(iwp) :: j !<
170	INTEGER(iwp) :: k !<
171	INTEGER(iwp) :: l !<
172	INTEGER(iwp) :: ys !<
173
174	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
175	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
176
177	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
178	#if __acc_fft_device
179	!$ACC DECLARE CREATE(work)
180	#endif
181
182
183	IF ( numprocs /= 1 ) THEN
184
185	#if defined( __parallel )
186	!
187	!-- Transpose array
188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
189
190	#if __acc_fft_device
191	#ifndef __cuda_aware_mpi
192	!$ACC UPDATE HOST(f_inv)
193	#else
194	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
195	#endif
196	#endif
197
198	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
199	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
200	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
201	comm1dy, ierr )
202
203	#if __acc_fft_device
204	#ifndef __cuda_aware_mpi
205	!$ACC UPDATE DEVICE(work)
206	#else
207	!$ACC END HOST_DATA
208	#endif
209	#endif
210
211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
212
213	!
214	!-- Reorder transposed array
215	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
216	!$OMP DO
217	DO l = 0, pdims(2) - 1
218	ys = 0 + l * ( nyn_x - nys_x + 1 )
219	#if __acc_fft_device
220	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
221	!$ACC PRESENT(f_out, work)
222	#endif
223	DO i = nxl_y, nxr_y
224	DO k = nzb_y, nzt_y
225	DO j = ys, ys + nyn_x - nys_x
226	f_out(j,i,k) = work(j-ys+1,k,i,l)
227	ENDDO
228	ENDDO
229	ENDDO
230	ENDDO
231	!$OMP END PARALLEL
232	#endif
233
234	ELSE
235
236	!
237	!-- Reorder transposed array
238	!$OMP PARALLEL PRIVATE ( i, j, k )
239	!$OMP DO
240	#if __acc_fft_device
241	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
242	!$ACC PRESENT(f_out, f_inv)
243	#endif
244	DO k = nzb_y, nzt_y
245	DO i = nxl_y, nxr_y
246	DO j = 0, ny
247	f_out(j,i,k) = f_inv(j,k,i)
248	ENDDO
249	ENDDO
250	ENDDO
251	!$OMP END PARALLEL
252
253	ENDIF
254
255	END SUBROUTINE transpose_xy
256
257
258	!------------------------------------------------------------------------------!
259	! Description:
260	! ------------
261	!> Resorting data after the transposition from x to z. The transposition itself
262	!> is carried out in transpose_xz
263	!------------------------------------------------------------------------------!
264	SUBROUTINE resort_for_xz( f_inv, f_out )
265
266
267	USE indices, &
268	ONLY: nxl, nxr, nyn, nys, nz
269
270	USE kinds
271
272	IMPLICIT NONE
273
274	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
275	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
276
277	INTEGER(iwp) :: i !<
278	INTEGER(iwp) :: j !<
279	INTEGER(iwp) :: k !<
280	!
281	!-- Rearrange indices of input array in order to make data to be send
282	!-- by MPI contiguous.
283	!-- In case of parallel fft/transposition, scattered store is faster in
284	!-- backward direction!!!
285	!$OMP PARALLEL PRIVATE ( i, j, k )
286	!$OMP DO
287	#if __acc_fft_device
288	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
289	!$ACC PRESENT(f_out, f_inv)
290	#endif
291	DO k = 1, nz
292	DO i = nxl, nxr
293	DO j = nys, nyn
294	f_out(k,j,i) = f_inv(j,i,k)
295	ENDDO
296	ENDDO
297	ENDDO
298	!$OMP END PARALLEL
299
300	END SUBROUTINE resort_for_xz
301
302
303	!------------------------------------------------------------------------------!
304	! Description:
305	! ------------
306	!> Transposition of input array (f_in) from x to z. For the input array, all
307	!> elements along x reside on the same PE, while after transposition, all
308	!> elements along z reside on the same PE.
309	!------------------------------------------------------------------------------!
310	SUBROUTINE transpose_xz( f_in, f_inv )
311
312
313	USE cpulog, &
314	ONLY: cpu_log, cpu_log_nowait, log_point_s
315
316	USE indices, &
317	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
318
319	USE kinds
320
321	USE pegrid
322
323	USE transpose_indices, &
324	ONLY: nyn_x, nys_x, nzb_x, nzt_x
325
326	IMPLICIT NONE
327
328	INTEGER(iwp) :: i !<
329	INTEGER(iwp) :: j !<
330	INTEGER(iwp) :: k !<
331	INTEGER(iwp) :: l !<
332	INTEGER(iwp) :: xs !<
333
334	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
335	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
336
337	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
338	#if __acc_fft_device
339	!$ACC DECLARE CREATE(work)
340	#endif
341
342
343	!
344	!-- If the PE grid is one-dimensional along y, the array has only to be
345	!-- reordered locally and therefore no transposition has to be done.
346	IF ( pdims(1) /= 1 ) THEN
347
348	#if defined( __parallel )
349	!
350	!-- Reorder input array for transposition
351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
352	!$OMP DO
353	DO l = 0, pdims(1) - 1
354	xs = 0 + l * nnx
355	#if __acc_fft_device
356	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
357	!$ACC PRESENT(work, f_in)
358	#endif
359	DO k = nzb_x, nzt_x
360	DO i = xs, xs + nnx - 1
361	DO j = nys_x, nyn_x
362	work(j,i-xs+1,k,l) = f_in(i,j,k)
363	ENDDO
364	ENDDO
365	ENDDO
366	ENDDO
367	!$OMP END PARALLEL
368
369	!
370	!-- Transpose array
371	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
372
373	#if __acc_fft_device
374	#ifndef __cuda_aware_mpi
375	!$ACC UPDATE HOST(work)
376	#else
377	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
378	#endif
379	#endif
380
381	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
382	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
383	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
384	comm1dx, ierr )
385
386	#if __acc_fft_device
387	#ifndef __cuda_aware_mpi
388	!$ACC UPDATE DEVICE(f_inv)
389	#else
390	!$ACC END HOST_DATA
391	#endif
392	#endif
393
394	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
395	#endif
396
397	ELSE
398
399	!
400	!-- Reorder the array in a way that the z index is in first position
401	!$OMP PARALLEL PRIVATE ( i, j, k )
402	!$OMP DO
403	#if __acc_fft_device
404	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
405	!$ACC PRESENT(f_inv, f_in)
406	#endif
407	DO i = nxl, nxr
408	DO j = nys, nyn
409	DO k = 1, nz
410	f_inv(j,i,k) = f_in(i,j,k)
411	ENDDO
412	ENDDO
413	ENDDO
414	!$OMP END PARALLEL
415
416	ENDIF
417
418	END SUBROUTINE transpose_xz
419
420
421	!------------------------------------------------------------------------------!
422	! Description:
423	! ------------
424	!> Resorting data after the transposition from y to x. The transposition itself
425	!> is carried out in transpose_yx
426	!------------------------------------------------------------------------------!
427	SUBROUTINE resort_for_yx( f_inv, f_out )
428
429
430	USE indices, &
431	ONLY: nx
432
433	USE kinds
434
435	USE transpose_indices, &
436	ONLY: nyn_x, nys_x, nzb_x, nzt_x
437
438	IMPLICIT NONE
439
440	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
441	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
442
443
444	INTEGER(iwp) :: i !<
445	INTEGER(iwp) :: j !<
446	INTEGER(iwp) :: k !<
447	!
448	!-- Rearrange indices of input array in order to make data to be send
449	!-- by MPI contiguous
450	!$OMP PARALLEL PRIVATE ( i, j, k )
451	!$OMP DO
452	#if __acc_fft_device
453	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
454	!$ACC PRESENT(f_out, f_inv)
455	#endif
456	DO i = 0, nx
457	DO k = nzb_x, nzt_x
458	DO j = nys_x, nyn_x
459	f_out(i,j,k) = f_inv(j,k,i)
460	ENDDO
461	ENDDO
462	ENDDO
463	!$OMP END PARALLEL
464
465	END SUBROUTINE resort_for_yx
466
467
468	!------------------------------------------------------------------------------!
469	! Description:
470	! ------------
471	!> Transposition of input array (f_in) from y to x. For the input array, all
472	!> elements along y reside on the same PE, while after transposition, all
473	!> elements along x reside on the same PE.
474	!------------------------------------------------------------------------------!
475	SUBROUTINE transpose_yx( f_in, f_inv )
476
477
478	USE cpulog, &
479	ONLY: cpu_log, cpu_log_nowait, log_point_s
480
481	USE indices, &
482	ONLY: nx, ny
483
484	USE kinds
485
486	USE pegrid
487
488	USE transpose_indices, &
489	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
490
491	IMPLICIT NONE
492
493	INTEGER(iwp) :: i !<
494	INTEGER(iwp) :: j !<
495	INTEGER(iwp) :: k !<
496	INTEGER(iwp) :: l !<
497	INTEGER(iwp) :: ys !<
498
499	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
500	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
501
502	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
503	#if __acc_fft_device
504	!$ACC DECLARE CREATE(work)
505	#endif
506
507
508	IF ( numprocs /= 1 ) THEN
509
510	#if defined( __parallel )
511	!
512	!-- Reorder input array for transposition
513	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
514	!$OMP DO
515	DO l = 0, pdims(2) - 1
516	ys = 0 + l * ( nyn_x - nys_x + 1 )
517	#if __acc_fft_device
518	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
519	!$ACC PRESENT(work, f_in)
520	#endif
521	DO i = nxl_y, nxr_y
522	DO k = nzb_y, nzt_y
523	DO j = ys, ys + nyn_x - nys_x
524	work(j-ys+1,k,i,l) = f_in(j,i,k)
525	ENDDO
526	ENDDO
527	ENDDO
528	ENDDO
529	!$OMP END PARALLEL
530
531	!
532	!-- Transpose array
533	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
534
535	#if __acc_fft_device
536	#ifndef __cuda_aware_mpi
537	!$ACC UPDATE HOST(work)
538	#else
539	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
540	#endif
541	#endif
542
543	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
544	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
545	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
546	comm1dy, ierr )
547
548	#if __acc_fft_device
549	#ifndef __cuda_aware_mpi
550	!$ACC UPDATE DEVICE(f_inv)
551	#else
552	!$ACC END HOST_DATA
553	#endif
554	#endif
555
556	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
557	#endif
558
559	ELSE
560
561	!
562	!-- Reorder array f_in the same way as ALLTOALL did it
563	!$OMP PARALLEL PRIVATE ( i, j, k )
564	!$OMP DO
565	#if __acc_fft_device
566	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
567	!$ACC PRESENT(f_inv, f_in)
568	#endif
569	DO i = nxl_y, nxr_y
570	DO k = nzb_y, nzt_y
571	DO j = 0, ny
572	f_inv(j,k,i) = f_in(j,i,k)
573	ENDDO
574	ENDDO
575	ENDDO
576	!$OMP END PARALLEL
577
578	ENDIF
579
580	END SUBROUTINE transpose_yx
581
582
583	!------------------------------------------------------------------------------!
584	! Description:
585	! ------------
586	!> Transposition of input array (f_in) from y to x. For the input array, all
587	!> elements along y reside on the same PE, while after transposition, all
588	!> elements along x reside on the same PE.
589	!> This is a direct transposition for arrays with indices in regular order
590	!> (k,j,i) (cf. transpose_yx).
591	!------------------------------------------------------------------------------!
592	SUBROUTINE transpose_yxd( f_in, f_out )
593
594
595	USE cpulog, &
596	ONLY: cpu_log, log_point_s
597
598	USE indices, &
599	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
600
601	USE kinds
602
603	USE pegrid
604
605	USE transpose_indices, &
606	ONLY: nyn_x, nys_x, nzb_x, nzt_x
607
608	IMPLICIT NONE
609
610	INTEGER(iwp) :: i !<
611	INTEGER(iwp) :: j !<
612	INTEGER(iwp) :: k !<
613	INTEGER(iwp) :: l !<
614	INTEGER(iwp) :: m !<
615	INTEGER(iwp) :: xs !<
616
617	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
618	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
619	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
620	REAL(wp) :: work(nnxnnynnz) !<
621	#if defined( __parallel )
622
623	!
624	!-- Rearrange indices of input array in order to make data to be send
625	!-- by MPI contiguous
626	DO k = 1, nz
627	DO j = nys, nyn
628	DO i = nxl, nxr
629	f_inv(i,k,j) = f_in(k,j,i)
630	ENDDO
631	ENDDO
632	ENDDO
633
634	!
635	!-- Transpose array
636	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
637	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
638	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
639	work(1), sendrecvcount_xy, MPI_REAL, &
640	comm1dx, ierr )
641	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
642
643	!
644	!-- Reorder transposed array
645	m = 0
646	DO l = 0, pdims(1) - 1
647	xs = 0 + l * nnx
648	DO j = nys_x, nyn_x
649	DO k = 1, nz
650	DO i = xs, xs + nnx - 1
651	m = m + 1
652	f_out(i,j,k) = work(m)
653	ENDDO
654	ENDDO
655	ENDDO
656	ENDDO
657
658	#endif
659
660	END SUBROUTINE transpose_yxd
661
662
663	!------------------------------------------------------------------------------!
664	! Description:
665	! ------------
666	!> Resorting data for the transposition from y to z. The transposition itself
667	!> is carried out in transpose_yz
668	!------------------------------------------------------------------------------!
669	SUBROUTINE resort_for_yz( f_in, f_inv )
670
671
672	USE indices, &
673	ONLY: ny
674
675	USE kinds
676
677	USE transpose_indices, &
678	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
679
680	IMPLICIT NONE
681
682	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
683	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
684
685	INTEGER(iwp) :: i !<
686	INTEGER(iwp) :: j !<
687	INTEGER(iwp) :: k !<
688
689	!
690	!-- Rearrange indices of input array in order to make data to be send
691	!-- by MPI contiguous
692	!$OMP PARALLEL PRIVATE ( i, j, k )
693	!$OMP DO
694	#if __acc_fft_device
695	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
696	!$ACC PRESENT(f_inv, f_in)
697	#endif
698	DO j = 0, ny
699	DO k = nzb_y, nzt_y
700	DO i = nxl_y, nxr_y
701	f_inv(i,k,j) = f_in(j,i,k)
702	ENDDO
703	ENDDO
704	ENDDO
705	!$OMP END PARALLEL
706
707	END SUBROUTINE resort_for_yz
708
709
710	!------------------------------------------------------------------------------!
711	! Description:
712	! ------------
713	!> Transposition of input array (f_in) from y to z. For the input array, all
714	!> elements along y reside on the same PE, while after transposition, all
715	!> elements along z reside on the same PE.
716	!------------------------------------------------------------------------------!
717	SUBROUTINE transpose_yz( f_inv, f_out )
718
719
720	USE cpulog, &
721	ONLY: cpu_log, cpu_log_nowait, log_point_s
722
723	USE indices, &
724	ONLY: ny, nz
725
726	USE kinds
727
728	USE pegrid
729
730	USE transpose_indices, &
731	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
732
733	IMPLICIT NONE
734
735	INTEGER(iwp) :: i !<
736	INTEGER(iwp) :: j !<
737	INTEGER(iwp) :: k !<
738	INTEGER(iwp) :: l !<
739	INTEGER(iwp) :: zs !<
740
741	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
742	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
743
744	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
745	#if __acc_fft_device
746	!$ACC DECLARE CREATE(work)
747	#endif
748
749
750	!
751	!-- If the PE grid is one-dimensional along y, only local reordering
752	!-- of the data is necessary and no transposition has to be done.
753	IF ( pdims(1) == 1 ) THEN
754
755	!$OMP PARALLEL PRIVATE ( i, j, k )
756	!$OMP DO
757	#if __acc_fft_device
758	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
759	!$ACC PRESENT(f_out, f_inv)
760	#endif
761	DO j = 0, ny
762	DO k = nzb_y, nzt_y
763	DO i = nxl_y, nxr_y
764	f_out(i,j,k) = f_inv(i,k,j)
765	ENDDO
766	ENDDO
767	ENDDO
768	!$OMP END PARALLEL
769
770	ELSE
771
772	#if defined( __parallel )
773	!
774	!-- Transpose array
775	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
776
777	#if __acc_fft_device
778	#ifndef __cuda_aware_mpi
779	!$ACC UPDATE HOST(f_inv)
780	#else
781	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
782	#endif
783	#endif
784
785	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
786	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
787	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
788	comm1dx, ierr )
789
790	#if __acc_fft_device
791	#ifndef __cuda_aware_mpi
792	!$ACC UPDATE DEVICE(work)
793	#else
794	!$ACC END HOST_DATA
795	#endif
796	#endif
797
798	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
799
800	!
801	!-- Reorder transposed array
802	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
803	!$OMP DO
804	DO l = 0, pdims(1) - 1
805	zs = 1 + l * ( nzt_y - nzb_y + 1 )
806	#if __acc_fft_device
807	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
808	!$ACC PRESENT(f_out, work)
809	#endif
810	DO j = nys_z, nyn_z
811	DO k = zs, zs + nzt_y - nzb_y
812	DO i = nxl_z, nxr_z
813	f_out(i,j,k) = work(i,k-zs+1,j,l)
814	ENDDO
815	ENDDO
816	ENDDO
817	ENDDO
818	!$OMP END PARALLEL
819	#endif
820
821	ENDIF
822
823	END SUBROUTINE transpose_yz
824
825
826	!------------------------------------------------------------------------------!
827	! Description:
828	! ------------
829	!> Resorting data for the transposition from z to x. The transposition itself
830	!> is carried out in transpose_zx
831	!------------------------------------------------------------------------------!
832	SUBROUTINE resort_for_zx( f_in, f_inv )
833
834
835	USE indices, &
836	ONLY: nxl, nxr, nyn, nys, nz
837
838	USE kinds
839
840	IMPLICIT NONE
841
842	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
843	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
844
845	INTEGER(iwp) :: i !<
846	INTEGER(iwp) :: j !<
847	INTEGER(iwp) :: k !<
848
849	!
850	!-- Rearrange indices of input array in order to make data to be send
851	!-- by MPI contiguous
852	!$OMP PARALLEL PRIVATE ( i, j, k )
853	!$OMP DO
854	#if __acc_fft_device
855	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
856	!$ACC PRESENT(f_in, f_inv)
857	#endif
858	DO k = 1,nz
859	DO i = nxl, nxr
860	DO j = nys, nyn
861	f_inv(j,i,k) = f_in(k,j,i)
862	ENDDO
863	ENDDO
864	ENDDO
865	!$OMP END PARALLEL
866
867	END SUBROUTINE resort_for_zx
868
869
870	!------------------------------------------------------------------------------!
871	! Description:
872	! ------------
873	!> Transposition of input array (f_in) from z to x. For the input array, all
874	!> elements along z reside on the same PE, while after transposition, all
875	!> elements along x reside on the same PE.
876	!------------------------------------------------------------------------------!
877	SUBROUTINE transpose_zx( f_inv, f_out )
878
879
880	USE cpulog, &
881	ONLY: cpu_log, cpu_log_nowait, log_point_s
882
883	USE indices, &
884	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
885
886	USE kinds
887
888	USE pegrid
889
890	USE transpose_indices, &
891	ONLY: nyn_x, nys_x, nzb_x, nzt_x
892
893	IMPLICIT NONE
894
895	INTEGER(iwp) :: i !<
896	INTEGER(iwp) :: j !<
897	INTEGER(iwp) :: k !<
898	INTEGER(iwp) :: l !<
899	INTEGER(iwp) :: xs !<
900
901	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
902	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
903
904	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
905	#if __acc_fft_device
906	!$ACC DECLARE CREATE(work)
907	#endif
908
909
910	!
911	!-- If the PE grid is one-dimensional along y, only local reordering
912	!-- of the data is necessary and no transposition has to be done.
913	IF ( pdims(1) == 1 ) THEN
914
915	!$OMP PARALLEL PRIVATE ( i, j, k )
916	!$OMP DO
917	#if __acc_fft_device
918	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
919	!$ACC PRESENT(f_out, f_inv)
920	#endif
921	DO k = 1, nz
922	DO i = nxl, nxr
923	DO j = nys, nyn
924	f_out(i,j,k) = f_inv(j,i,k)
925	ENDDO
926	ENDDO
927	ENDDO
928	!$OMP END PARALLEL
929
930	ELSE
931
932	#if defined( __parallel )
933	!
934	!-- Transpose array
935	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
936
937	#if __acc_fft_device
938	#ifndef __cuda_aware_mpi
939	!$ACC UPDATE HOST(f_inv)
940	#else
941	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
942	#endif
943	#endif
944
945	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
946	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
947	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
948	comm1dx, ierr )
949
950	#if __acc_fft_device
951	#ifndef __cuda_aware_mpi
952	!$ACC UPDATE DEVICE(work)
953	#else
954	!$ACC END HOST_DATA
955	#endif
956	#endif
957
958	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
959
960	!
961	!-- Reorder transposed array
962	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
963	!$OMP DO
964	DO l = 0, pdims(1) - 1
965	xs = 0 + l * nnx
966	#if __acc_fft_device
967	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
968	!$ACC PRESENT(f_out, work)
969	#endif
970	DO k = nzb_x, nzt_x
971	DO i = xs, xs + nnx - 1
972	DO j = nys_x, nyn_x
973	f_out(i,j,k) = work(j,i-xs+1,k,l)
974	ENDDO
975	ENDDO
976	ENDDO
977	ENDDO
978	!$OMP END PARALLEL
979	#endif
980
981	ENDIF
982
983	END SUBROUTINE transpose_zx
984
985
986	!------------------------------------------------------------------------------!
987	! Description:
988	! ------------
989	!> Resorting data after the transposition from z to y. The transposition itself
990	!> is carried out in transpose_zy
991	!------------------------------------------------------------------------------!
992	SUBROUTINE resort_for_zy( f_inv, f_out )
993
994
995	USE indices, &
996	ONLY: ny
997
998	USE kinds
999
1000	USE transpose_indices, &
1001	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1002
1003	IMPLICIT NONE
1004
1005	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1006	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1007
1008
1009	INTEGER(iwp) :: i !<
1010	INTEGER(iwp) :: j !<
1011	INTEGER(iwp) :: k !<
1012
1013	!
1014	!-- Rearrange indices of input array in order to make data to be send
1015	!-- by MPI contiguous
1016	!$OMP PARALLEL PRIVATE ( i, j, k )
1017	!$OMP DO
1018	#if __acc_fft_device
1019	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1020	!$ACC PRESENT(f_out, f_inv)
1021	#endif
1022	DO k = nzb_y, nzt_y
1023	DO j = 0, ny
1024	DO i = nxl_y, nxr_y
1025	f_out(j,i,k) = f_inv(i,k,j)
1026	ENDDO
1027	ENDDO
1028	ENDDO
1029	!$OMP END PARALLEL
1030
1031	END SUBROUTINE resort_for_zy
1032
1033
1034	!------------------------------------------------------------------------------!
1035	! Description:cpu_log_nowait
1036	! ------------
1037	!> Transposition of input array (f_in) from z to y. For the input array, all
1038	!> elements along z reside on the same PE, while after transposition, all
1039	!> elements along y reside on the same PE.
1040	!------------------------------------------------------------------------------!
1041	SUBROUTINE transpose_zy( f_in, f_inv )
1042
1043
1044	USE cpulog, &
1045	ONLY: cpu_log, cpu_log_nowait, log_point_s
1046
1047	USE indices, &
1048	ONLY: ny, nz
1049
1050	USE kinds
1051
1052	USE pegrid
1053
1054	USE transpose_indices, &
1055	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1056
1057	IMPLICIT NONE
1058
1059	INTEGER(iwp) :: i !<
1060	INTEGER(iwp) :: j !<
1061	INTEGER(iwp) :: k !<
1062	INTEGER(iwp) :: l !<
1063	INTEGER(iwp) :: zs !<
1064
1065	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1066	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1067
1068	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1069	#if __acc_fft_device
1070	!$ACC DECLARE CREATE(work)
1071	#endif
1072
1073	!
1074	!-- If the PE grid is one-dimensional along y, the array has only to be
1075	!-- reordered locally and therefore no transposition has to be done.
1076	IF ( pdims(1) /= 1 ) THEN
1077
1078	#if defined( __parallel )
1079	!
1080	!-- Reorder input array for transposition
1081	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1082	!$OMP DO
1083	DO l = 0, pdims(1) - 1
1084	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1085	#if __acc_fft_device
1086	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1087	!$ACC PRESENT(work, f_in)
1088	#endif
1089	DO j = nys_z, nyn_z
1090	DO k = zs, zs + nzt_y - nzb_y
1091	DO i = nxl_z, nxr_z
1092	work(i,k-zs+1,j,l) = f_in(i,j,k)
1093	ENDDO
1094	ENDDO
1095	ENDDO
1096	ENDDO
1097	!$OMP END PARALLEL
1098
1099	!
1100	!-- Transpose array
1101	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1102
1103	#if __acc_fft_device
1104	#ifndef __cuda_aware_mpi
1105	!$ACC UPDATE HOST(work)
1106	#else
1107	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1108	#endif
1109	#endif
1110
1111	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1112	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1113	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1114	comm1dx, ierr )
1115
1116	#if __acc_fft_device
1117	#ifndef __cuda_aware_mpi
1118	!$ACC UPDATE DEVICE(f_inv)
1119	#else
1120	!$ACC END HOST_DATA
1121	#endif
1122	#endif
1123
1124	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1125	#endif
1126
1127	ELSE
1128	!
1129	!-- Reorder the array in the same way like ALLTOALL did it
1130	!$OMP PARALLEL PRIVATE ( i, j, k )
1131	!$OMP DO
1132	#if __acc_fft_device
1133	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1134	!$ACC PRESENT(f_inv, f_in)
1135	#endif
1136	DO k = nzb_y, nzt_y
1137	DO j = 0, ny
1138	DO i = nxl_y, nxr_y
1139	f_inv(i,k,j) = f_in(i,j,k)
1140	ENDDO
1141	ENDDO
1142	ENDDO
1143	!$OMP END PARALLEL
1144
1145	ENDIF
1146
1147	END SUBROUTINE transpose_zy
1148
1149
1150	!------------------------------------------------------------------------------!
1151	! Description:
1152	! ------------
1153	!> Transposition of input array (f_in) from z to y. For the input array, all
1154	!> elements along z reside on the same PE, while after transposition, all
1155	!> elements along y reside on the same PE.
1156	!> This is a direct transposition for arrays with indices in regular order
1157	!> (k,j,i) (cf. transpose_zy).
1158	!------------------------------------------------------------------------------!
1159	SUBROUTINE transpose_zyd( f_in, f_out )
1160
1161
1162	USE cpulog, &
1163	ONLY: cpu_log, log_point_s
1164
1165	USE indices, &
1166	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1167
1168	USE kinds
1169
1170	USE pegrid
1171
1172	USE transpose_indices, &
1173	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1174
1175	IMPLICIT NONE
1176
1177	INTEGER(iwp) :: i !<
1178	INTEGER(iwp) :: j !<
1179	INTEGER(iwp) :: k !<
1180	INTEGER(iwp) :: l !<
1181	INTEGER(iwp) :: m !<
1182	INTEGER(iwp) :: ys !<
1183
1184	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1185	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1186	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1187	REAL(wp) :: work(nnxnnynnz) !<
1188
1189	#if defined( __parallel )
1190
1191	!
1192	!-- Rearrange indices of input array in order to make data to be send
1193	!-- by MPI contiguous
1194	DO i = nxl, nxr
1195	DO j = nys, nyn
1196	DO k = 1, nz
1197	f_inv(j,i,k) = f_in(k,j,i)
1198	ENDDO
1199	ENDDO
1200	ENDDO
1201
1202	!
1203	!-- Move data to different array, because memory location of work1 is
1204	!-- needed further below (work1 = work2).
1205	!-- If the PE grid is one-dimensional along x, only local reordering
1206	!-- of the data is necessary and no transposition has to be done.
1207	IF ( pdims(2) == 1 ) THEN
1208	DO k = 1, nz
1209	DO i = nxl, nxr
1210	DO j = nys, nyn
1211	f_out(j,i,k) = f_inv(j,i,k)
1212	ENDDO
1213	ENDDO
1214	ENDDO
1215	RETURN
1216	ENDIF
1217
1218	!
1219	!-- Transpose array
1220	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1221	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1222	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1223	work(1), sendrecvcount_zyd, MPI_REAL, &
1224	comm1dy, ierr )
1225	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1226
1227	!
1228	!-- Reorder transposed array
1229	m = 0
1230	DO l = 0, pdims(2) - 1
1231	ys = 0 + l * nny
1232	DO k = nzb_yd, nzt_yd
1233	DO i = nxl_yd, nxr_yd
1234	DO j = ys, ys + nny - 1
1235	m = m + 1
1236	f_out(j,i,k) = work(m)
1237	ENDDO
1238	ENDDO
1239	ENDDO
1240	ENDDO
1241
1242	#endif
1243
1244	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |