Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4021

Last change on this file since 4021 was 3832, checked in by raasch, 6 years ago
some routines instrumented with openmp directives, loop reordering for performance optimization
Property svn:keywords set to `Id`
File size: 34.0 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 3832 2019-03-28 13:16:58Z maronga $
27	! loop reordering for performance optimization
28	!
29	! 3694 2019-01-23 17:01:49Z knoop
30	! OpenACC port for SPEC
31	!
32	! 3241 2018-09-12 15:02:00Z raasch
33	! unused variables removed
34	!
35	! 2718 2018-01-02 08:49:38Z maronga
36	! Corrected "Former revisions" section
37	!
38	! 2696 2017-12-14 17:12:51Z kanani
39	! Change in file header (GPL part)
40	!
41	! 2119 2017-01-17 16:51:50Z raasch
42	!
43	! 2118 2017-01-17 16:38:49Z raasch
44	! OpenACC directives removed
45	!
46	! 2000 2016-08-20 18:09:15Z knoop
47	! Forced header and separation lines into 80 columns
48	!
49	! 1682 2015-10-07 23:56:08Z knoop
50	! Code annotations made doxygen readable
51	!
52	! 1324 2014-03-21 09:13:16Z suehring
53	! Bugfix: ONLY statement for module pegrid removed
54	!
55	! 1320 2014-03-20 08:40:49Z raasch
56	! ONLY-attribute added to USE-statements,
57	! kind-parameters added to all INTEGER and REAL declaration statements,
58	! kinds are defined in new module kinds,
59	! old module precision_kind is removed,
60	! revision history before 2012 removed,
61	! comment fields (!:) to be used for variable explanations added to
62	! all variable declaration statements
63	!
64	! 1318 2014-03-17 13:35:16Z raasch
65	! cpu_log_nowait parameter added to cpu measurements of the transpositions
66	! required for solving the Poisson equation (poisfft),
67	! module interfaces removed
68	!
69	! 1257 2013-11-08 15:18:40Z raasch
70	! openacc loop and loop vector clauses removed
71	!
72	! 1216 2013-08-26 09:31:42Z raasch
73	! re-sorting of the transposed / to be transposed arrays moved to separate
74	! routines resort_for_...
75	!
76	! 1111 2013-03-08 23:54:10Z raasch
77	! openACC directives added,
78	! resorting data from/to work changed, work got 4 dimensions instead of 1
79	!
80	! 1106 2013-03-04 05:31:38Z raasch
81	! preprocessor lines rearranged so that routines can also be used in serial
82	! (non-parallel) mode
83	!
84	! 1092 2013-02-02 11:24:22Z raasch
85	! unused variables removed
86	!
87	! 1036 2012-10-22 13:43:42Z raasch
88	! code put under GPL (PALM 3.9)
89	!
90	! 1003 2012-09-14 14:35:53Z raasch
91	! indices nxa, nya, etc. replaced by nx, ny, etc.
92	!
93	! Revision 1.1 1997/07/24 11:25:18 raasch
94	! Initial revision
95	!
96
97	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
98
99	!------------------------------------------------------------------------------!
100	! Description:
101	! ------------
102	!> Resorting data for the transposition from x to y. The transposition itself
103	!> is carried out in transpose_xy
104	!------------------------------------------------------------------------------!
105	SUBROUTINE resort_for_xy( f_in, f_inv )
106
107
108	USE indices, &
109	ONLY: nx
110
111	USE kinds
112
113	USE transpose_indices, &
114	ONLY: nyn_x, nys_x, nzb_x, nzt_x
115
116	IMPLICIT NONE
117
118	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
119	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
120
121
122	INTEGER(iwp) :: i !<
123	INTEGER(iwp) :: j !<
124	INTEGER(iwp) :: k !<
125	!
126	!-- Rearrange indices of input array in order to make data to be send
127	!-- by MPI contiguous
128	!$OMP PARALLEL PRIVATE ( i, j, k )
129	!$OMP DO
130	#if __acc_fft_device
131	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
132	!$ACC PRESENT(f_inv, f_in)
133	#endif
134	DO k = nzb_x, nzt_x
135	DO i = 0, nx
136	DO j = nys_x, nyn_x
137	f_inv(j,k,i) = f_in(i,j,k)
138	ENDDO
139	ENDDO
140	ENDDO
141	!$OMP END PARALLEL
142
143	END SUBROUTINE resort_for_xy
144
145
146	!------------------------------------------------------------------------------!
147	! Description:
148	! ------------
149	!> Transposition of input array (f_in) from x to y. For the input array, all
150	!> elements along x reside on the same PE, while after transposition, all
151	!> elements along y reside on the same PE.
152	!------------------------------------------------------------------------------!
153	SUBROUTINE transpose_xy( f_inv, f_out )
154
155
156	USE cpulog, &
157	ONLY: cpu_log, cpu_log_nowait, log_point_s
158
159	USE indices, &
160	ONLY: nx, ny
161
162	USE kinds
163
164	USE pegrid
165
166	USE transpose_indices, &
167	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
168
169	IMPLICIT NONE
170
171	INTEGER(iwp) :: i !<
172	INTEGER(iwp) :: j !<
173	INTEGER(iwp) :: k !<
174	INTEGER(iwp) :: l !<
175	INTEGER(iwp) :: ys !<
176
177	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
178	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
179
180	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
181	#if __acc_fft_device
182	!$ACC DECLARE CREATE(work)
183	#endif
184
185
186	IF ( numprocs /= 1 ) THEN
187
188	#if defined( __parallel )
189	!
190	!-- Transpose array
191	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
192
193	#if __acc_fft_device
194	#ifndef __cuda_aware_mpi
195	!$ACC UPDATE HOST(f_inv)
196	#else
197	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
198	#endif
199	#endif
200
201	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
202	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
203	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
204	comm1dy, ierr )
205
206	#if __acc_fft_device
207	#ifndef __cuda_aware_mpi
208	!$ACC UPDATE DEVICE(work)
209	#else
210	!$ACC END HOST_DATA
211	#endif
212	#endif
213
214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
215
216	!
217	!-- Reorder transposed array
218	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
219	!$OMP DO
220	DO l = 0, pdims(2) - 1
221	ys = 0 + l * ( nyn_x - nys_x + 1 )
222	#if __acc_fft_device
223	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
224	!$ACC PRESENT(f_out, work)
225	#endif
226	DO i = nxl_y, nxr_y
227	DO k = nzb_y, nzt_y
228	DO j = ys, ys + nyn_x - nys_x
229	f_out(j,i,k) = work(j-ys+1,k,i,l)
230	ENDDO
231	ENDDO
232	ENDDO
233	ENDDO
234	!$OMP END PARALLEL
235	#endif
236
237	ELSE
238
239	!
240	!-- Reorder transposed array
241	!$OMP PARALLEL PRIVATE ( i, j, k )
242	!$OMP DO
243	#if __acc_fft_device
244	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
245	!$ACC PRESENT(f_out, f_inv)
246	#endif
247	DO k = nzb_y, nzt_y
248	DO i = nxl_y, nxr_y
249	DO j = 0, ny
250	f_out(j,i,k) = f_inv(j,k,i)
251	ENDDO
252	ENDDO
253	ENDDO
254	!$OMP END PARALLEL
255
256	ENDIF
257
258	END SUBROUTINE transpose_xy
259
260
261	!------------------------------------------------------------------------------!
262	! Description:
263	! ------------
264	!> Resorting data after the transposition from x to z. The transposition itself
265	!> is carried out in transpose_xz
266	!------------------------------------------------------------------------------!
267	SUBROUTINE resort_for_xz( f_inv, f_out )
268
269
270	USE indices, &
271	ONLY: nxl, nxr, nyn, nys, nz
272
273	USE kinds
274
275	IMPLICIT NONE
276
277	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
278	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
279
280	INTEGER(iwp) :: i !<
281	INTEGER(iwp) :: j !<
282	INTEGER(iwp) :: k !<
283	!
284	!-- Rearrange indices of input array in order to make data to be send
285	!-- by MPI contiguous.
286	!-- In case of parallel fft/transposition, scattered store is faster in
287	!-- backward direction!!!
288	!$OMP PARALLEL PRIVATE ( i, j, k )
289	!$OMP DO
290	#if __acc_fft_device
291	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
292	!$ACC PRESENT(f_out, f_inv)
293	#endif
294	DO k = 1, nz
295	DO i = nxl, nxr
296	DO j = nys, nyn
297	f_out(k,j,i) = f_inv(j,i,k)
298	ENDDO
299	ENDDO
300	ENDDO
301	!$OMP END PARALLEL
302
303	END SUBROUTINE resort_for_xz
304
305
306	!------------------------------------------------------------------------------!
307	! Description:
308	! ------------
309	!> Transposition of input array (f_in) from x to z. For the input array, all
310	!> elements along x reside on the same PE, while after transposition, all
311	!> elements along z reside on the same PE.
312	!------------------------------------------------------------------------------!
313	SUBROUTINE transpose_xz( f_in, f_inv )
314
315
316	USE cpulog, &
317	ONLY: cpu_log, cpu_log_nowait, log_point_s
318
319	USE indices, &
320	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
321
322	USE kinds
323
324	USE pegrid
325
326	USE transpose_indices, &
327	ONLY: nyn_x, nys_x, nzb_x, nzt_x
328
329	IMPLICIT NONE
330
331	INTEGER(iwp) :: i !<
332	INTEGER(iwp) :: j !<
333	INTEGER(iwp) :: k !<
334	INTEGER(iwp) :: l !<
335	INTEGER(iwp) :: xs !<
336
337	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
338	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
339
340	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
341	#if __acc_fft_device
342	!$ACC DECLARE CREATE(work)
343	#endif
344
345
346	!
347	!-- If the PE grid is one-dimensional along y, the array has only to be
348	!-- reordered locally and therefore no transposition has to be done.
349	IF ( pdims(1) /= 1 ) THEN
350
351	#if defined( __parallel )
352	!
353	!-- Reorder input array for transposition
354	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
355	!$OMP DO
356	DO l = 0, pdims(1) - 1
357	xs = 0 + l * nnx
358	#if __acc_fft_device
359	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
360	!$ACC PRESENT(work, f_in)
361	#endif
362	DO k = nzb_x, nzt_x
363	DO i = xs, xs + nnx - 1
364	DO j = nys_x, nyn_x
365	work(j,i-xs+1,k,l) = f_in(i,j,k)
366	ENDDO
367	ENDDO
368	ENDDO
369	ENDDO
370	!$OMP END PARALLEL
371
372	!
373	!-- Transpose array
374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
375
376	#if __acc_fft_device
377	#ifndef __cuda_aware_mpi
378	!$ACC UPDATE HOST(work)
379	#else
380	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
381	#endif
382	#endif
383
384	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
385	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
386	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
387	comm1dx, ierr )
388
389	#if __acc_fft_device
390	#ifndef __cuda_aware_mpi
391	!$ACC UPDATE DEVICE(f_inv)
392	#else
393	!$ACC END HOST_DATA
394	#endif
395	#endif
396
397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
398	#endif
399
400	ELSE
401
402	!
403	!-- Reorder the array in a way that the z index is in first position
404	!$OMP PARALLEL PRIVATE ( i, j, k )
405	!$OMP DO
406	#if __acc_fft_device
407	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
408	!$ACC PRESENT(f_inv, f_in)
409	#endif
410	DO i = nxl, nxr
411	DO j = nys, nyn
412	DO k = 1, nz
413	f_inv(j,i,k) = f_in(i,j,k)
414	ENDDO
415	ENDDO
416	ENDDO
417	!$OMP END PARALLEL
418
419	ENDIF
420
421	END SUBROUTINE transpose_xz
422
423
424	!------------------------------------------------------------------------------!
425	! Description:
426	! ------------
427	!> Resorting data after the transposition from y to x. The transposition itself
428	!> is carried out in transpose_yx
429	!------------------------------------------------------------------------------!
430	SUBROUTINE resort_for_yx( f_inv, f_out )
431
432
433	USE indices, &
434	ONLY: nx
435
436	USE kinds
437
438	USE transpose_indices, &
439	ONLY: nyn_x, nys_x, nzb_x, nzt_x
440
441	IMPLICIT NONE
442
443	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
444	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
445
446
447	INTEGER(iwp) :: i !<
448	INTEGER(iwp) :: j !<
449	INTEGER(iwp) :: k !<
450	!
451	!-- Rearrange indices of input array in order to make data to be send
452	!-- by MPI contiguous
453	!$OMP PARALLEL PRIVATE ( i, j, k )
454	!$OMP DO
455	#if __acc_fft_device
456	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
457	!$ACC PRESENT(f_out, f_inv)
458	#endif
459	DO i = 0, nx
460	DO k = nzb_x, nzt_x
461	DO j = nys_x, nyn_x
462	f_out(i,j,k) = f_inv(j,k,i)
463	ENDDO
464	ENDDO
465	ENDDO
466	!$OMP END PARALLEL
467
468	END SUBROUTINE resort_for_yx
469
470
471	!------------------------------------------------------------------------------!
472	! Description:
473	! ------------
474	!> Transposition of input array (f_in) from y to x. For the input array, all
475	!> elements along y reside on the same PE, while after transposition, all
476	!> elements along x reside on the same PE.
477	!------------------------------------------------------------------------------!
478	SUBROUTINE transpose_yx( f_in, f_inv )
479
480
481	USE cpulog, &
482	ONLY: cpu_log, cpu_log_nowait, log_point_s
483
484	USE indices, &
485	ONLY: nx, ny
486
487	USE kinds
488
489	USE pegrid
490
491	USE transpose_indices, &
492	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
493
494	IMPLICIT NONE
495
496	INTEGER(iwp) :: i !<
497	INTEGER(iwp) :: j !<
498	INTEGER(iwp) :: k !<
499	INTEGER(iwp) :: l !<
500	INTEGER(iwp) :: ys !<
501
502	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
503	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
504
505	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
506	#if __acc_fft_device
507	!$ACC DECLARE CREATE(work)
508	#endif
509
510
511	IF ( numprocs /= 1 ) THEN
512
513	#if defined( __parallel )
514	!
515	!-- Reorder input array for transposition
516	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
517	!$OMP DO
518	DO l = 0, pdims(2) - 1
519	ys = 0 + l * ( nyn_x - nys_x + 1 )
520	#if __acc_fft_device
521	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
522	!$ACC PRESENT(work, f_in)
523	#endif
524	DO i = nxl_y, nxr_y
525	DO k = nzb_y, nzt_y
526	DO j = ys, ys + nyn_x - nys_x
527	work(j-ys+1,k,i,l) = f_in(j,i,k)
528	ENDDO
529	ENDDO
530	ENDDO
531	ENDDO
532	!$OMP END PARALLEL
533
534	!
535	!-- Transpose array
536	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
537
538	#if __acc_fft_device
539	#ifndef __cuda_aware_mpi
540	!$ACC UPDATE HOST(work)
541	#else
542	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
543	#endif
544	#endif
545
546	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
547	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
548	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
549	comm1dy, ierr )
550
551	#if __acc_fft_device
552	#ifndef __cuda_aware_mpi
553	!$ACC UPDATE DEVICE(f_inv)
554	#else
555	!$ACC END HOST_DATA
556	#endif
557	#endif
558
559	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
560	#endif
561
562	ELSE
563
564	!
565	!-- Reorder array f_in the same way as ALLTOALL did it
566	!$OMP PARALLEL PRIVATE ( i, j, k )
567	!$OMP DO
568	#if __acc_fft_device
569	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
570	!$ACC PRESENT(f_inv, f_in)
571	#endif
572	DO i = nxl_y, nxr_y
573	DO k = nzb_y, nzt_y
574	DO j = 0, ny
575	f_inv(j,k,i) = f_in(j,i,k)
576	ENDDO
577	ENDDO
578	ENDDO
579	!$OMP END PARALLEL
580
581	ENDIF
582
583	END SUBROUTINE transpose_yx
584
585
586	!------------------------------------------------------------------------------!
587	! Description:
588	! ------------
589	!> Transposition of input array (f_in) from y to x. For the input array, all
590	!> elements along y reside on the same PE, while after transposition, all
591	!> elements along x reside on the same PE.
592	!> This is a direct transposition for arrays with indices in regular order
593	!> (k,j,i) (cf. transpose_yx).
594	!------------------------------------------------------------------------------!
595	SUBROUTINE transpose_yxd( f_in, f_out )
596
597
598	USE cpulog, &
599	ONLY: cpu_log, log_point_s
600
601	USE indices, &
602	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
603
604	USE kinds
605
606	USE pegrid
607
608	USE transpose_indices, &
609	ONLY: nyn_x, nys_x, nzb_x, nzt_x
610
611	IMPLICIT NONE
612
613	INTEGER(iwp) :: i !<
614	INTEGER(iwp) :: j !<
615	INTEGER(iwp) :: k !<
616	INTEGER(iwp) :: l !<
617	INTEGER(iwp) :: m !<
618	INTEGER(iwp) :: xs !<
619
620	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
621	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
622	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
623	REAL(wp) :: work(nnxnnynnz) !<
624	#if defined( __parallel )
625
626	!
627	!-- Rearrange indices of input array in order to make data to be send
628	!-- by MPI contiguous
629	DO k = 1, nz
630	DO j = nys, nyn
631	DO i = nxl, nxr
632	f_inv(i,k,j) = f_in(k,j,i)
633	ENDDO
634	ENDDO
635	ENDDO
636
637	!
638	!-- Transpose array
639	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
640	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
641	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
642	work(1), sendrecvcount_xy, MPI_REAL, &
643	comm1dx, ierr )
644	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
645
646	!
647	!-- Reorder transposed array
648	m = 0
649	DO l = 0, pdims(1) - 1
650	xs = 0 + l * nnx
651	DO j = nys_x, nyn_x
652	DO k = 1, nz
653	DO i = xs, xs + nnx - 1
654	m = m + 1
655	f_out(i,j,k) = work(m)
656	ENDDO
657	ENDDO
658	ENDDO
659	ENDDO
660
661	#endif
662
663	END SUBROUTINE transpose_yxd
664
665
666	!------------------------------------------------------------------------------!
667	! Description:
668	! ------------
669	!> Resorting data for the transposition from y to z. The transposition itself
670	!> is carried out in transpose_yz
671	!------------------------------------------------------------------------------!
672	SUBROUTINE resort_for_yz( f_in, f_inv )
673
674
675	USE indices, &
676	ONLY: ny
677
678	USE kinds
679
680	USE transpose_indices, &
681	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
682
683	IMPLICIT NONE
684
685	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
686	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
687
688	INTEGER(iwp) :: i !<
689	INTEGER(iwp) :: j !<
690	INTEGER(iwp) :: k !<
691
692	!
693	!-- Rearrange indices of input array in order to make data to be send
694	!-- by MPI contiguous
695	!$OMP PARALLEL PRIVATE ( i, j, k )
696	!$OMP DO
697	#if __acc_fft_device
698	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
699	!$ACC PRESENT(f_inv, f_in)
700	#endif
701	DO j = 0, ny
702	DO k = nzb_y, nzt_y
703	DO i = nxl_y, nxr_y
704	f_inv(i,k,j) = f_in(j,i,k)
705	ENDDO
706	ENDDO
707	ENDDO
708	!$OMP END PARALLEL
709
710	END SUBROUTINE resort_for_yz
711
712
713	!------------------------------------------------------------------------------!
714	! Description:
715	! ------------
716	!> Transposition of input array (f_in) from y to z. For the input array, all
717	!> elements along y reside on the same PE, while after transposition, all
718	!> elements along z reside on the same PE.
719	!------------------------------------------------------------------------------!
720	SUBROUTINE transpose_yz( f_inv, f_out )
721
722
723	USE cpulog, &
724	ONLY: cpu_log, cpu_log_nowait, log_point_s
725
726	USE indices, &
727	ONLY: ny, nz
728
729	USE kinds
730
731	USE pegrid
732
733	USE transpose_indices, &
734	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
735
736	IMPLICIT NONE
737
738	INTEGER(iwp) :: i !<
739	INTEGER(iwp) :: j !<
740	INTEGER(iwp) :: k !<
741	INTEGER(iwp) :: l !<
742	INTEGER(iwp) :: zs !<
743
744	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
745	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
746
747	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
748	#if __acc_fft_device
749	!$ACC DECLARE CREATE(work)
750	#endif
751
752
753	!
754	!-- If the PE grid is one-dimensional along y, only local reordering
755	!-- of the data is necessary and no transposition has to be done.
756	IF ( pdims(1) == 1 ) THEN
757
758	!$OMP PARALLEL PRIVATE ( i, j, k )
759	!$OMP DO
760	#if __acc_fft_device
761	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
762	!$ACC PRESENT(f_out, f_inv)
763	#endif
764	DO j = 0, ny
765	DO k = nzb_y, nzt_y
766	DO i = nxl_y, nxr_y
767	f_out(i,j,k) = f_inv(i,k,j)
768	ENDDO
769	ENDDO
770	ENDDO
771	!$OMP END PARALLEL
772
773	ELSE
774
775	#if defined( __parallel )
776	!
777	!-- Transpose array
778	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
779
780	#if __acc_fft_device
781	#ifndef __cuda_aware_mpi
782	!$ACC UPDATE HOST(f_inv)
783	#else
784	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
785	#endif
786	#endif
787
788	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
789	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
790	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
791	comm1dx, ierr )
792
793	#if __acc_fft_device
794	#ifndef __cuda_aware_mpi
795	!$ACC UPDATE DEVICE(work)
796	#else
797	!$ACC END HOST_DATA
798	#endif
799	#endif
800
801	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
802
803	!
804	!-- Reorder transposed array
805	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
806	!$OMP DO
807	DO l = 0, pdims(1) - 1
808	zs = 1 + l * ( nzt_y - nzb_y + 1 )
809	#if __acc_fft_device
810	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
811	!$ACC PRESENT(f_out, work)
812	#endif
813	DO j = nys_z, nyn_z
814	DO k = zs, zs + nzt_y - nzb_y
815	DO i = nxl_z, nxr_z
816	f_out(i,j,k) = work(i,k-zs+1,j,l)
817	ENDDO
818	ENDDO
819	ENDDO
820	ENDDO
821	!$OMP END PARALLEL
822	#endif
823
824	ENDIF
825
826	END SUBROUTINE transpose_yz
827
828
829	!------------------------------------------------------------------------------!
830	! Description:
831	! ------------
832	!> Resorting data for the transposition from z to x. The transposition itself
833	!> is carried out in transpose_zx
834	!------------------------------------------------------------------------------!
835	SUBROUTINE resort_for_zx( f_in, f_inv )
836
837
838	USE indices, &
839	ONLY: nxl, nxr, nyn, nys, nz
840
841	USE kinds
842
843	IMPLICIT NONE
844
845	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
846	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
847
848	INTEGER(iwp) :: i !<
849	INTEGER(iwp) :: j !<
850	INTEGER(iwp) :: k !<
851
852	!
853	!-- Rearrange indices of input array in order to make data to be send
854	!-- by MPI contiguous
855	!$OMP PARALLEL PRIVATE ( i, j, k )
856	!$OMP DO
857	#if __acc_fft_device
858	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
859	!$ACC PRESENT(f_in, f_inv)
860	#endif
861	DO i = nxl, nxr
862	DO k = 1,nz
863	DO j = nys, nyn
864	f_inv(j,i,k) = f_in(k,j,i)
865	ENDDO
866	ENDDO
867	ENDDO
868	!$OMP END PARALLEL
869
870	END SUBROUTINE resort_for_zx
871
872
873	!------------------------------------------------------------------------------!
874	! Description:
875	! ------------
876	!> Transposition of input array (f_in) from z to x. For the input array, all
877	!> elements along z reside on the same PE, while after transposition, all
878	!> elements along x reside on the same PE.
879	!------------------------------------------------------------------------------!
880	SUBROUTINE transpose_zx( f_inv, f_out )
881
882
883	USE cpulog, &
884	ONLY: cpu_log, cpu_log_nowait, log_point_s
885
886	USE indices, &
887	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
888
889	USE kinds
890
891	USE pegrid
892
893	USE transpose_indices, &
894	ONLY: nyn_x, nys_x, nzb_x, nzt_x
895
896	IMPLICIT NONE
897
898	INTEGER(iwp) :: i !<
899	INTEGER(iwp) :: j !<
900	INTEGER(iwp) :: k !<
901	INTEGER(iwp) :: l !<
902	INTEGER(iwp) :: xs !<
903
904	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
905	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
906
907	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
908	#if __acc_fft_device
909	!$ACC DECLARE CREATE(work)
910	#endif
911
912
913	!
914	!-- If the PE grid is one-dimensional along y, only local reordering
915	!-- of the data is necessary and no transposition has to be done.
916	IF ( pdims(1) == 1 ) THEN
917
918	!$OMP PARALLEL PRIVATE ( i, j, k )
919	!$OMP DO
920	#if __acc_fft_device
921	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
922	!$ACC PRESENT(f_out, f_inv)
923	#endif
924	DO k = 1, nz
925	DO i = nxl, nxr
926	DO j = nys, nyn
927	f_out(i,j,k) = f_inv(j,i,k)
928	ENDDO
929	ENDDO
930	ENDDO
931	!$OMP END PARALLEL
932
933	ELSE
934
935	#if defined( __parallel )
936	!
937	!-- Transpose array
938	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
939
940	#if __acc_fft_device
941	#ifndef __cuda_aware_mpi
942	!$ACC UPDATE HOST(f_inv)
943	#else
944	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
945	#endif
946	#endif
947
948	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
949	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
950	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
951	comm1dx, ierr )
952
953	#if __acc_fft_device
954	#ifndef __cuda_aware_mpi
955	!$ACC UPDATE DEVICE(work)
956	#else
957	!$ACC END HOST_DATA
958	#endif
959	#endif
960
961	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
962
963	!
964	!-- Reorder transposed array
965	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
966	!$OMP DO
967	DO l = 0, pdims(1) - 1
968	xs = 0 + l * nnx
969	#if __acc_fft_device
970	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
971	!$ACC PRESENT(f_out, work)
972	#endif
973	DO k = nzb_x, nzt_x
974	DO i = xs, xs + nnx - 1
975	DO j = nys_x, nyn_x
976	f_out(i,j,k) = work(j,i-xs+1,k,l)
977	ENDDO
978	ENDDO
979	ENDDO
980	ENDDO
981	!$OMP END PARALLEL
982	#endif
983
984	ENDIF
985
986	END SUBROUTINE transpose_zx
987
988
989	!------------------------------------------------------------------------------!
990	! Description:
991	! ------------
992	!> Resorting data after the transposition from z to y. The transposition itself
993	!> is carried out in transpose_zy
994	!------------------------------------------------------------------------------!
995	SUBROUTINE resort_for_zy( f_inv, f_out )
996
997
998	USE indices, &
999	ONLY: ny
1000
1001	USE kinds
1002
1003	USE transpose_indices, &
1004	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1005
1006	IMPLICIT NONE
1007
1008	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1009	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1010
1011
1012	INTEGER(iwp) :: i !<
1013	INTEGER(iwp) :: j !<
1014	INTEGER(iwp) :: k !<
1015
1016	!
1017	!-- Rearrange indices of input array in order to make data to be send
1018	!-- by MPI contiguous
1019	!$OMP PARALLEL PRIVATE ( i, j, k )
1020	!$OMP DO
1021	#if __acc_fft_device
1022	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1023	!$ACC PRESENT(f_out, f_inv)
1024	#endif
1025	DO j = 0, ny
1026	DO k = nzb_y, nzt_y
1027	DO i = nxl_y, nxr_y
1028	f_out(j,i,k) = f_inv(i,k,j)
1029	ENDDO
1030	ENDDO
1031	ENDDO
1032	!$OMP END PARALLEL
1033
1034	END SUBROUTINE resort_for_zy
1035
1036
1037	!------------------------------------------------------------------------------!
1038	! Description:cpu_log_nowait
1039	! ------------
1040	!> Transposition of input array (f_in) from z to y. For the input array, all
1041	!> elements along z reside on the same PE, while after transposition, all
1042	!> elements along y reside on the same PE.
1043	!------------------------------------------------------------------------------!
1044	SUBROUTINE transpose_zy( f_in, f_inv )
1045
1046
1047	USE cpulog, &
1048	ONLY: cpu_log, cpu_log_nowait, log_point_s
1049
1050	USE indices, &
1051	ONLY: ny, nz
1052
1053	USE kinds
1054
1055	USE pegrid
1056
1057	USE transpose_indices, &
1058	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1059
1060	IMPLICIT NONE
1061
1062	INTEGER(iwp) :: i !<
1063	INTEGER(iwp) :: j !<
1064	INTEGER(iwp) :: k !<
1065	INTEGER(iwp) :: l !<
1066	INTEGER(iwp) :: zs !<
1067
1068	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1069	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1070
1071	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1072	#if __acc_fft_device
1073	!$ACC DECLARE CREATE(work)
1074	#endif
1075
1076	!
1077	!-- If the PE grid is one-dimensional along y, the array has only to be
1078	!-- reordered locally and therefore no transposition has to be done.
1079	IF ( pdims(1) /= 1 ) THEN
1080
1081	#if defined( __parallel )
1082	!
1083	!-- Reorder input array for transposition
1084	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1085	!$OMP DO
1086	DO l = 0, pdims(1) - 1
1087	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1088	#if __acc_fft_device
1089	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1090	!$ACC PRESENT(work, f_in)
1091	#endif
1092	DO j = nys_z, nyn_z
1093	DO k = zs, zs + nzt_y - nzb_y
1094	DO i = nxl_z, nxr_z
1095	work(i,k-zs+1,j,l) = f_in(i,j,k)
1096	ENDDO
1097	ENDDO
1098	ENDDO
1099	ENDDO
1100	!$OMP END PARALLEL
1101
1102	!
1103	!-- Transpose array
1104	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1105
1106	#if __acc_fft_device
1107	#ifndef __cuda_aware_mpi
1108	!$ACC UPDATE HOST(work)
1109	#else
1110	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1111	#endif
1112	#endif
1113
1114	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1115	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1116	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1117	comm1dx, ierr )
1118
1119	#if __acc_fft_device
1120	#ifndef __cuda_aware_mpi
1121	!$ACC UPDATE DEVICE(f_inv)
1122	#else
1123	!$ACC END HOST_DATA
1124	#endif
1125	#endif
1126
1127	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1128	#endif
1129
1130	ELSE
1131	!
1132	!-- Reorder the array in the same way like ALLTOALL did it
1133	!$OMP PARALLEL PRIVATE ( i, j, k )
1134	!$OMP DO
1135	#if __acc_fft_device
1136	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1137	!$ACC PRESENT(f_inv, f_in)
1138	#endif
1139	DO k = nzb_y, nzt_y
1140	DO j = 0, ny
1141	DO i = nxl_y, nxr_y
1142	f_inv(i,k,j) = f_in(i,j,k)
1143	ENDDO
1144	ENDDO
1145	ENDDO
1146	!$OMP END PARALLEL
1147
1148	ENDIF
1149
1150	END SUBROUTINE transpose_zy
1151
1152
1153	!------------------------------------------------------------------------------!
1154	! Description:
1155	! ------------
1156	!> Transposition of input array (f_in) from z to y. For the input array, all
1157	!> elements along z reside on the same PE, while after transposition, all
1158	!> elements along y reside on the same PE.
1159	!> This is a direct transposition for arrays with indices in regular order
1160	!> (k,j,i) (cf. transpose_zy).
1161	!------------------------------------------------------------------------------!
1162	SUBROUTINE transpose_zyd( f_in, f_out )
1163
1164
1165	USE cpulog, &
1166	ONLY: cpu_log, log_point_s
1167
1168	USE indices, &
1169	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1170
1171	USE kinds
1172
1173	USE pegrid
1174
1175	USE transpose_indices, &
1176	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1177
1178	IMPLICIT NONE
1179
1180	INTEGER(iwp) :: i !<
1181	INTEGER(iwp) :: j !<
1182	INTEGER(iwp) :: k !<
1183	INTEGER(iwp) :: l !<
1184	INTEGER(iwp) :: m !<
1185	INTEGER(iwp) :: ys !<
1186
1187	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1188	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1189	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1190	REAL(wp) :: work(nnxnnynnz) !<
1191
1192	#if defined( __parallel )
1193
1194	!
1195	!-- Rearrange indices of input array in order to make data to be send
1196	!-- by MPI contiguous
1197	DO i = nxl, nxr
1198	DO j = nys, nyn
1199	DO k = 1, nz
1200	f_inv(j,i,k) = f_in(k,j,i)
1201	ENDDO
1202	ENDDO
1203	ENDDO
1204
1205	!
1206	!-- Move data to different array, because memory location of work1 is
1207	!-- needed further below (work1 = work2).
1208	!-- If the PE grid is one-dimensional along x, only local reordering
1209	!-- of the data is necessary and no transposition has to be done.
1210	IF ( pdims(2) == 1 ) THEN
1211	DO k = 1, nz
1212	DO i = nxl, nxr
1213	DO j = nys, nyn
1214	f_out(j,i,k) = f_inv(j,i,k)
1215	ENDDO
1216	ENDDO
1217	ENDDO
1218	RETURN
1219	ENDIF
1220
1221	!
1222	!-- Transpose array
1223	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1224	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1225	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1226	work(1), sendrecvcount_zyd, MPI_REAL, &
1227	comm1dy, ierr )
1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1229
1230	!
1231	!-- Reorder transposed array
1232	m = 0
1233	DO l = 0, pdims(2) - 1
1234	ys = 0 + l * nny
1235	DO k = nzb_yd, nzt_yd
1236	DO i = nxl_yd, nxr_yd
1237	DO j = ys, ys + nny - 1
1238	m = m + 1
1239	f_out(j,i,k) = work(m)
1240	ENDDO
1241	ENDDO
1242	ENDDO
1243	ENDDO
1244
1245	#endif
1246
1247	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |