Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4177

Last change on this file since 4177 was 4171, checked in by gronemeier, 5 years ago
loop reordering for performance optimization
Property svn:keywords set to `Id`
File size: 34.0 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4171 2019-08-19 17:44:09Z gronemeier $
27	! loop reordering for performance optimization
28	!
29	! 3832 2019-03-28 13:16:58Z raasch
30	! loop reordering for performance optimization
31	!
32	! 3694 2019-01-23 17:01:49Z knoop
33	! OpenACC port for SPEC
34	!
35	! 3241 2018-09-12 15:02:00Z raasch
36	! unused variables removed
37	!
38	! 2718 2018-01-02 08:49:38Z maronga
39	! Corrected "Former revisions" section
40	!
41	! 2696 2017-12-14 17:12:51Z kanani
42	! Change in file header (GPL part)
43	!
44	! 2119 2017-01-17 16:51:50Z raasch
45	!
46	! 2118 2017-01-17 16:38:49Z raasch
47	! OpenACC directives removed
48	!
49	! 2000 2016-08-20 18:09:15Z knoop
50	! Forced header and separation lines into 80 columns
51	!
52	! 1682 2015-10-07 23:56:08Z knoop
53	! Code annotations made doxygen readable
54	!
55	! 1324 2014-03-21 09:13:16Z suehring
56	! Bugfix: ONLY statement for module pegrid removed
57	!
58	! 1320 2014-03-20 08:40:49Z raasch
59	! ONLY-attribute added to USE-statements,
60	! kind-parameters added to all INTEGER and REAL declaration statements,
61	! kinds are defined in new module kinds,
62	! old module precision_kind is removed,
63	! revision history before 2012 removed,
64	! comment fields (!:) to be used for variable explanations added to
65	! all variable declaration statements
66	!
67	! 1318 2014-03-17 13:35:16Z raasch
68	! cpu_log_nowait parameter added to cpu measurements of the transpositions
69	! required for solving the Poisson equation (poisfft),
70	! module interfaces removed
71	!
72	! 1257 2013-11-08 15:18:40Z raasch
73	! openacc loop and loop vector clauses removed
74	!
75	! 1216 2013-08-26 09:31:42Z raasch
76	! re-sorting of the transposed / to be transposed arrays moved to separate
77	! routines resort_for_...
78	!
79	! 1111 2013-03-08 23:54:10Z raasch
80	! openACC directives added,
81	! resorting data from/to work changed, work got 4 dimensions instead of 1
82	!
83	! 1106 2013-03-04 05:31:38Z raasch
84	! preprocessor lines rearranged so that routines can also be used in serial
85	! (non-parallel) mode
86	!
87	! 1092 2013-02-02 11:24:22Z raasch
88	! unused variables removed
89	!
90	! 1036 2012-10-22 13:43:42Z raasch
91	! code put under GPL (PALM 3.9)
92	!
93	! 1003 2012-09-14 14:35:53Z raasch
94	! indices nxa, nya, etc. replaced by nx, ny, etc.
95	!
96	! Revision 1.1 1997/07/24 11:25:18 raasch
97	! Initial revision
98	!
99
100	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
101
102	!------------------------------------------------------------------------------!
103	! Description:
104	! ------------
105	!> Resorting data for the transposition from x to y. The transposition itself
106	!> is carried out in transpose_xy
107	!------------------------------------------------------------------------------!
108	SUBROUTINE resort_for_xy( f_in, f_inv )
109
110
111	USE indices, &
112	ONLY: nx
113
114	USE kinds
115
116	USE transpose_indices, &
117	ONLY: nyn_x, nys_x, nzb_x, nzt_x
118
119	IMPLICIT NONE
120
121	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
122	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
123
124
125	INTEGER(iwp) :: i !<
126	INTEGER(iwp) :: j !<
127	INTEGER(iwp) :: k !<
128	!
129	!-- Rearrange indices of input array in order to make data to be send
130	!-- by MPI contiguous
131	!$OMP PARALLEL PRIVATE ( i, j, k )
132	!$OMP DO
133	#if __acc_fft_device
134	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
135	!$ACC PRESENT(f_inv, f_in)
136	#endif
137	DO k = nzb_x, nzt_x
138	DO j = nys_x, nyn_x
139	DO i = 0, nx
140	f_inv(j,k,i) = f_in(i,j,k)
141	ENDDO
142	ENDDO
143	ENDDO
144	!$OMP END PARALLEL
145
146	END SUBROUTINE resort_for_xy
147
148
149	!------------------------------------------------------------------------------!
150	! Description:
151	! ------------
152	!> Transposition of input array (f_in) from x to y. For the input array, all
153	!> elements along x reside on the same PE, while after transposition, all
154	!> elements along y reside on the same PE.
155	!------------------------------------------------------------------------------!
156	SUBROUTINE transpose_xy( f_inv, f_out )
157
158
159	USE cpulog, &
160	ONLY: cpu_log, cpu_log_nowait, log_point_s
161
162	USE indices, &
163	ONLY: nx, ny
164
165	USE kinds
166
167	USE pegrid
168
169	USE transpose_indices, &
170	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
171
172	IMPLICIT NONE
173
174	INTEGER(iwp) :: i !<
175	INTEGER(iwp) :: j !<
176	INTEGER(iwp) :: k !<
177	INTEGER(iwp) :: l !<
178	INTEGER(iwp) :: ys !<
179
180	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
181	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
182
183	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
184	#if __acc_fft_device
185	!$ACC DECLARE CREATE(work)
186	#endif
187
188
189	IF ( numprocs /= 1 ) THEN
190
191	#if defined( __parallel )
192	!
193	!-- Transpose array
194	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
195
196	#if __acc_fft_device
197	#ifndef __cuda_aware_mpi
198	!$ACC UPDATE HOST(f_inv)
199	#else
200	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
201	#endif
202	#endif
203
204	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
205	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
206	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
207	comm1dy, ierr )
208
209	#if __acc_fft_device
210	#ifndef __cuda_aware_mpi
211	!$ACC UPDATE DEVICE(work)
212	#else
213	!$ACC END HOST_DATA
214	#endif
215	#endif
216
217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
218
219	!
220	!-- Reorder transposed array
221	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
222	!$OMP DO
223	DO l = 0, pdims(2) - 1
224	ys = 0 + l * ( nyn_x - nys_x + 1 )
225	#if __acc_fft_device
226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
227	!$ACC PRESENT(f_out, work)
228	#endif
229	DO i = nxl_y, nxr_y
230	DO k = nzb_y, nzt_y
231	DO j = ys, ys + nyn_x - nys_x
232	f_out(j,i,k) = work(j-ys+1,k,i,l)
233	ENDDO
234	ENDDO
235	ENDDO
236	ENDDO
237	!$OMP END PARALLEL
238	#endif
239
240	ELSE
241
242	!
243	!-- Reorder transposed array
244	!$OMP PARALLEL PRIVATE ( i, j, k )
245	!$OMP DO
246	#if __acc_fft_device
247	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
248	!$ACC PRESENT(f_out, f_inv)
249	#endif
250	DO k = nzb_y, nzt_y
251	DO i = nxl_y, nxr_y
252	DO j = 0, ny
253	f_out(j,i,k) = f_inv(j,k,i)
254	ENDDO
255	ENDDO
256	ENDDO
257	!$OMP END PARALLEL
258
259	ENDIF
260
261	END SUBROUTINE transpose_xy
262
263
264	!------------------------------------------------------------------------------!
265	! Description:
266	! ------------
267	!> Resorting data after the transposition from x to z. The transposition itself
268	!> is carried out in transpose_xz
269	!------------------------------------------------------------------------------!
270	SUBROUTINE resort_for_xz( f_inv, f_out )
271
272
273	USE indices, &
274	ONLY: nxl, nxr, nyn, nys, nz
275
276	USE kinds
277
278	IMPLICIT NONE
279
280	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
281	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
282
283	INTEGER(iwp) :: i !<
284	INTEGER(iwp) :: j !<
285	INTEGER(iwp) :: k !<
286	!
287	!-- Rearrange indices of input array in order to make data to be send
288	!-- by MPI contiguous.
289	!-- In case of parallel fft/transposition, scattered store is faster in
290	!-- backward direction!!!
291	!$OMP PARALLEL PRIVATE ( i, j, k )
292	!$OMP DO
293	#if __acc_fft_device
294	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
295	!$ACC PRESENT(f_out, f_inv)
296	#endif
297	DO i = nxl, nxr
298	DO j = nys, nyn
299	DO k = 1, nz
300	f_out(k,j,i) = f_inv(j,i,k)
301	ENDDO
302	ENDDO
303	ENDDO
304	!$OMP END PARALLEL
305
306	END SUBROUTINE resort_for_xz
307
308
309	!------------------------------------------------------------------------------!
310	! Description:
311	! ------------
312	!> Transposition of input array (f_in) from x to z. For the input array, all
313	!> elements along x reside on the same PE, while after transposition, all
314	!> elements along z reside on the same PE.
315	!------------------------------------------------------------------------------!
316	SUBROUTINE transpose_xz( f_in, f_inv )
317
318
319	USE cpulog, &
320	ONLY: cpu_log, cpu_log_nowait, log_point_s
321
322	USE indices, &
323	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
324
325	USE kinds
326
327	USE pegrid
328
329	USE transpose_indices, &
330	ONLY: nyn_x, nys_x, nzb_x, nzt_x
331
332	IMPLICIT NONE
333
334	INTEGER(iwp) :: i !<
335	INTEGER(iwp) :: j !<
336	INTEGER(iwp) :: k !<
337	INTEGER(iwp) :: l !<
338	INTEGER(iwp) :: xs !<
339
340	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
341	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
342
343	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
344	#if __acc_fft_device
345	!$ACC DECLARE CREATE(work)
346	#endif
347
348
349	!
350	!-- If the PE grid is one-dimensional along y, the array has only to be
351	!-- reordered locally and therefore no transposition has to be done.
352	IF ( pdims(1) /= 1 ) THEN
353
354	#if defined( __parallel )
355	!
356	!-- Reorder input array for transposition
357	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
358	!$OMP DO
359	DO l = 0, pdims(1) - 1
360	xs = 0 + l * nnx
361	#if __acc_fft_device
362	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
363	!$ACC PRESENT(work, f_in)
364	#endif
365	DO k = nzb_x, nzt_x
366	DO i = xs, xs + nnx - 1
367	DO j = nys_x, nyn_x
368	work(j,i-xs+1,k,l) = f_in(i,j,k)
369	ENDDO
370	ENDDO
371	ENDDO
372	ENDDO
373	!$OMP END PARALLEL
374
375	!
376	!-- Transpose array
377	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
378
379	#if __acc_fft_device
380	#ifndef __cuda_aware_mpi
381	!$ACC UPDATE HOST(work)
382	#else
383	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
384	#endif
385	#endif
386
387	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
388	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
389	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
390	comm1dx, ierr )
391
392	#if __acc_fft_device
393	#ifndef __cuda_aware_mpi
394	!$ACC UPDATE DEVICE(f_inv)
395	#else
396	!$ACC END HOST_DATA
397	#endif
398	#endif
399
400	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
401	#endif
402
403	ELSE
404
405	!
406	!-- Reorder the array in a way that the z index is in first position
407	!$OMP PARALLEL PRIVATE ( i, j, k )
408	!$OMP DO
409	#if __acc_fft_device
410	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
411	!$ACC PRESENT(f_inv, f_in)
412	#endif
413	DO i = nxl, nxr
414	DO j = nys, nyn
415	DO k = 1, nz
416	f_inv(j,i,k) = f_in(i,j,k)
417	ENDDO
418	ENDDO
419	ENDDO
420	!$OMP END PARALLEL
421
422	ENDIF
423
424	END SUBROUTINE transpose_xz
425
426
427	!------------------------------------------------------------------------------!
428	! Description:
429	! ------------
430	!> Resorting data after the transposition from y to x. The transposition itself
431	!> is carried out in transpose_yx
432	!------------------------------------------------------------------------------!
433	SUBROUTINE resort_for_yx( f_inv, f_out )
434
435
436	USE indices, &
437	ONLY: nx
438
439	USE kinds
440
441	USE transpose_indices, &
442	ONLY: nyn_x, nys_x, nzb_x, nzt_x
443
444	IMPLICIT NONE
445
446	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
447	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
448
449
450	INTEGER(iwp) :: i !<
451	INTEGER(iwp) :: j !<
452	INTEGER(iwp) :: k !<
453	!
454	!-- Rearrange indices of input array in order to make data to be send
455	!-- by MPI contiguous
456	!$OMP PARALLEL PRIVATE ( i, j, k )
457	!$OMP DO
458	#if __acc_fft_device
459	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
460	!$ACC PRESENT(f_out, f_inv)
461	#endif
462	DO k = nzb_x, nzt_x
463	DO j = nys_x, nyn_x
464	DO i = 0, nx
465	f_out(i,j,k) = f_inv(j,k,i)
466	ENDDO
467	ENDDO
468	ENDDO
469	!$OMP END PARALLEL
470
471	END SUBROUTINE resort_for_yx
472
473
474	!------------------------------------------------------------------------------!
475	! Description:
476	! ------------
477	!> Transposition of input array (f_in) from y to x. For the input array, all
478	!> elements along y reside on the same PE, while after transposition, all
479	!> elements along x reside on the same PE.
480	!------------------------------------------------------------------------------!
481	SUBROUTINE transpose_yx( f_in, f_inv )
482
483
484	USE cpulog, &
485	ONLY: cpu_log, cpu_log_nowait, log_point_s
486
487	USE indices, &
488	ONLY: nx, ny
489
490	USE kinds
491
492	USE pegrid
493
494	USE transpose_indices, &
495	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
496
497	IMPLICIT NONE
498
499	INTEGER(iwp) :: i !<
500	INTEGER(iwp) :: j !<
501	INTEGER(iwp) :: k !<
502	INTEGER(iwp) :: l !<
503	INTEGER(iwp) :: ys !<
504
505	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
506	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
507
508	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
509	#if __acc_fft_device
510	!$ACC DECLARE CREATE(work)
511	#endif
512
513
514	IF ( numprocs /= 1 ) THEN
515
516	#if defined( __parallel )
517	!
518	!-- Reorder input array for transposition
519	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
520	!$OMP DO
521	DO l = 0, pdims(2) - 1
522	ys = 0 + l * ( nyn_x - nys_x + 1 )
523	#if __acc_fft_device
524	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
525	!$ACC PRESENT(work, f_in)
526	#endif
527	DO i = nxl_y, nxr_y
528	DO k = nzb_y, nzt_y
529	DO j = ys, ys + nyn_x - nys_x
530	work(j-ys+1,k,i,l) = f_in(j,i,k)
531	ENDDO
532	ENDDO
533	ENDDO
534	ENDDO
535	!$OMP END PARALLEL
536
537	!
538	!-- Transpose array
539	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
540
541	#if __acc_fft_device
542	#ifndef __cuda_aware_mpi
543	!$ACC UPDATE HOST(work)
544	#else
545	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
546	#endif
547	#endif
548
549	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
550	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
551	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
552	comm1dy, ierr )
553
554	#if __acc_fft_device
555	#ifndef __cuda_aware_mpi
556	!$ACC UPDATE DEVICE(f_inv)
557	#else
558	!$ACC END HOST_DATA
559	#endif
560	#endif
561
562	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
563	#endif
564
565	ELSE
566
567	!
568	!-- Reorder array f_in the same way as ALLTOALL did it
569	!$OMP PARALLEL PRIVATE ( i, j, k )
570	!$OMP DO
571	#if __acc_fft_device
572	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
573	!$ACC PRESENT(f_inv, f_in)
574	#endif
575	DO i = nxl_y, nxr_y
576	DO k = nzb_y, nzt_y
577	DO j = 0, ny
578	f_inv(j,k,i) = f_in(j,i,k)
579	ENDDO
580	ENDDO
581	ENDDO
582	!$OMP END PARALLEL
583
584	ENDIF
585
586	END SUBROUTINE transpose_yx
587
588
589	!------------------------------------------------------------------------------!
590	! Description:
591	! ------------
592	!> Transposition of input array (f_in) from y to x. For the input array, all
593	!> elements along y reside on the same PE, while after transposition, all
594	!> elements along x reside on the same PE.
595	!> This is a direct transposition for arrays with indices in regular order
596	!> (k,j,i) (cf. transpose_yx).
597	!------------------------------------------------------------------------------!
598	SUBROUTINE transpose_yxd( f_in, f_out )
599
600
601	USE cpulog, &
602	ONLY: cpu_log, log_point_s
603
604	USE indices, &
605	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
606
607	USE kinds
608
609	USE pegrid
610
611	USE transpose_indices, &
612	ONLY: nyn_x, nys_x, nzb_x, nzt_x
613
614	IMPLICIT NONE
615
616	INTEGER(iwp) :: i !<
617	INTEGER(iwp) :: j !<
618	INTEGER(iwp) :: k !<
619	INTEGER(iwp) :: l !<
620	INTEGER(iwp) :: m !<
621	INTEGER(iwp) :: xs !<
622
623	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
624	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
625	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
626	REAL(wp) :: work(nnxnnynnz) !<
627	#if defined( __parallel )
628
629	!
630	!-- Rearrange indices of input array in order to make data to be send
631	!-- by MPI contiguous
632	DO k = 1, nz
633	DO j = nys, nyn
634	DO i = nxl, nxr
635	f_inv(i,k,j) = f_in(k,j,i)
636	ENDDO
637	ENDDO
638	ENDDO
639
640	!
641	!-- Transpose array
642	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
643	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
644	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
645	work(1), sendrecvcount_xy, MPI_REAL, &
646	comm1dx, ierr )
647	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
648
649	!
650	!-- Reorder transposed array
651	m = 0
652	DO l = 0, pdims(1) - 1
653	xs = 0 + l * nnx
654	DO j = nys_x, nyn_x
655	DO k = 1, nz
656	DO i = xs, xs + nnx - 1
657	m = m + 1
658	f_out(i,j,k) = work(m)
659	ENDDO
660	ENDDO
661	ENDDO
662	ENDDO
663
664	#endif
665
666	END SUBROUTINE transpose_yxd
667
668
669	!------------------------------------------------------------------------------!
670	! Description:
671	! ------------
672	!> Resorting data for the transposition from y to z. The transposition itself
673	!> is carried out in transpose_yz
674	!------------------------------------------------------------------------------!
675	SUBROUTINE resort_for_yz( f_in, f_inv )
676
677
678	USE indices, &
679	ONLY: ny
680
681	USE kinds
682
683	USE transpose_indices, &
684	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
685
686	IMPLICIT NONE
687
688	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
689	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
690
691	INTEGER(iwp) :: i !<
692	INTEGER(iwp) :: j !<
693	INTEGER(iwp) :: k !<
694
695	!
696	!-- Rearrange indices of input array in order to make data to be send
697	!-- by MPI contiguous
698	!$OMP PARALLEL PRIVATE ( i, j, k )
699	!$OMP DO
700	#if __acc_fft_device
701	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
702	!$ACC PRESENT(f_inv, f_in)
703	#endif
704	DO k = nzb_y, nzt_y
705	DO i = nxl_y, nxr_y
706	DO j = 0, ny
707	f_inv(i,k,j) = f_in(j,i,k)
708	ENDDO
709	ENDDO
710	ENDDO
711	!$OMP END PARALLEL
712
713	END SUBROUTINE resort_for_yz
714
715
716	!------------------------------------------------------------------------------!
717	! Description:
718	! ------------
719	!> Transposition of input array (f_in) from y to z. For the input array, all
720	!> elements along y reside on the same PE, while after transposition, all
721	!> elements along z reside on the same PE.
722	!------------------------------------------------------------------------------!
723	SUBROUTINE transpose_yz( f_inv, f_out )
724
725
726	USE cpulog, &
727	ONLY: cpu_log, cpu_log_nowait, log_point_s
728
729	USE indices, &
730	ONLY: ny, nz
731
732	USE kinds
733
734	USE pegrid
735
736	USE transpose_indices, &
737	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
738
739	IMPLICIT NONE
740
741	INTEGER(iwp) :: i !<
742	INTEGER(iwp) :: j !<
743	INTEGER(iwp) :: k !<
744	INTEGER(iwp) :: l !<
745	INTEGER(iwp) :: zs !<
746
747	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
748	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
749
750	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
751	#if __acc_fft_device
752	!$ACC DECLARE CREATE(work)
753	#endif
754
755
756	!
757	!-- If the PE grid is one-dimensional along y, only local reordering
758	!-- of the data is necessary and no transposition has to be done.
759	IF ( pdims(1) == 1 ) THEN
760
761	!$OMP PARALLEL PRIVATE ( i, j, k )
762	!$OMP DO
763	#if __acc_fft_device
764	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
765	!$ACC PRESENT(f_out, f_inv)
766	#endif
767	DO j = 0, ny
768	DO k = nzb_y, nzt_y
769	DO i = nxl_y, nxr_y
770	f_out(i,j,k) = f_inv(i,k,j)
771	ENDDO
772	ENDDO
773	ENDDO
774	!$OMP END PARALLEL
775
776	ELSE
777
778	#if defined( __parallel )
779	!
780	!-- Transpose array
781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
782
783	#if __acc_fft_device
784	#ifndef __cuda_aware_mpi
785	!$ACC UPDATE HOST(f_inv)
786	#else
787	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
788	#endif
789	#endif
790
791	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
792	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
793	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
794	comm1dx, ierr )
795
796	#if __acc_fft_device
797	#ifndef __cuda_aware_mpi
798	!$ACC UPDATE DEVICE(work)
799	#else
800	!$ACC END HOST_DATA
801	#endif
802	#endif
803
804	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
805
806	!
807	!-- Reorder transposed array
808	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
809	!$OMP DO
810	DO l = 0, pdims(1) - 1
811	zs = 1 + l * ( nzt_y - nzb_y + 1 )
812	#if __acc_fft_device
813	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
814	!$ACC PRESENT(f_out, work)
815	#endif
816	DO j = nys_z, nyn_z
817	DO k = zs, zs + nzt_y - nzb_y
818	DO i = nxl_z, nxr_z
819	f_out(i,j,k) = work(i,k-zs+1,j,l)
820	ENDDO
821	ENDDO
822	ENDDO
823	ENDDO
824	!$OMP END PARALLEL
825	#endif
826
827	ENDIF
828
829	END SUBROUTINE transpose_yz
830
831
832	!------------------------------------------------------------------------------!
833	! Description:
834	! ------------
835	!> Resorting data for the transposition from z to x. The transposition itself
836	!> is carried out in transpose_zx
837	!------------------------------------------------------------------------------!
838	SUBROUTINE resort_for_zx( f_in, f_inv )
839
840
841	USE indices, &
842	ONLY: nxl, nxr, nyn, nys, nz
843
844	USE kinds
845
846	IMPLICIT NONE
847
848	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
849	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
850
851	INTEGER(iwp) :: i !<
852	INTEGER(iwp) :: j !<
853	INTEGER(iwp) :: k !<
854
855	!
856	!-- Rearrange indices of input array in order to make data to be send
857	!-- by MPI contiguous
858	!$OMP PARALLEL PRIVATE ( i, j, k )
859	!$OMP DO
860	#if __acc_fft_device
861	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
862	!$ACC PRESENT(f_in, f_inv)
863	#endif
864	DO i = nxl, nxr
865	DO j = nys, nyn
866	DO k = 1,nz
867	f_inv(j,i,k) = f_in(k,j,i)
868	ENDDO
869	ENDDO
870	ENDDO
871	!$OMP END PARALLEL
872
873	END SUBROUTINE resort_for_zx
874
875
876	!------------------------------------------------------------------------------!
877	! Description:
878	! ------------
879	!> Transposition of input array (f_in) from z to x. For the input array, all
880	!> elements along z reside on the same PE, while after transposition, all
881	!> elements along x reside on the same PE.
882	!------------------------------------------------------------------------------!
883	SUBROUTINE transpose_zx( f_inv, f_out )
884
885
886	USE cpulog, &
887	ONLY: cpu_log, cpu_log_nowait, log_point_s
888
889	USE indices, &
890	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
891
892	USE kinds
893
894	USE pegrid
895
896	USE transpose_indices, &
897	ONLY: nyn_x, nys_x, nzb_x, nzt_x
898
899	IMPLICIT NONE
900
901	INTEGER(iwp) :: i !<
902	INTEGER(iwp) :: j !<
903	INTEGER(iwp) :: k !<
904	INTEGER(iwp) :: l !<
905	INTEGER(iwp) :: xs !<
906
907	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
908	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
909
910	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
911	#if __acc_fft_device
912	!$ACC DECLARE CREATE(work)
913	#endif
914
915
916	!
917	!-- If the PE grid is one-dimensional along y, only local reordering
918	!-- of the data is necessary and no transposition has to be done.
919	IF ( pdims(1) == 1 ) THEN
920
921	!$OMP PARALLEL PRIVATE ( i, j, k )
922	!$OMP DO
923	#if __acc_fft_device
924	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
925	!$ACC PRESENT(f_out, f_inv)
926	#endif
927	DO k = 1, nz
928	DO i = nxl, nxr
929	DO j = nys, nyn
930	f_out(i,j,k) = f_inv(j,i,k)
931	ENDDO
932	ENDDO
933	ENDDO
934	!$OMP END PARALLEL
935
936	ELSE
937
938	#if defined( __parallel )
939	!
940	!-- Transpose array
941	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
942
943	#if __acc_fft_device
944	#ifndef __cuda_aware_mpi
945	!$ACC UPDATE HOST(f_inv)
946	#else
947	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
948	#endif
949	#endif
950
951	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
952	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
953	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
954	comm1dx, ierr )
955
956	#if __acc_fft_device
957	#ifndef __cuda_aware_mpi
958	!$ACC UPDATE DEVICE(work)
959	#else
960	!$ACC END HOST_DATA
961	#endif
962	#endif
963
964	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
965
966	!
967	!-- Reorder transposed array
968	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
969	!$OMP DO
970	DO l = 0, pdims(1) - 1
971	xs = 0 + l * nnx
972	#if __acc_fft_device
973	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
974	!$ACC PRESENT(f_out, work)
975	#endif
976	DO k = nzb_x, nzt_x
977	DO i = xs, xs + nnx - 1
978	DO j = nys_x, nyn_x
979	f_out(i,j,k) = work(j,i-xs+1,k,l)
980	ENDDO
981	ENDDO
982	ENDDO
983	ENDDO
984	!$OMP END PARALLEL
985	#endif
986
987	ENDIF
988
989	END SUBROUTINE transpose_zx
990
991
992	!------------------------------------------------------------------------------!
993	! Description:
994	! ------------
995	!> Resorting data after the transposition from z to y. The transposition itself
996	!> is carried out in transpose_zy
997	!------------------------------------------------------------------------------!
998	SUBROUTINE resort_for_zy( f_inv, f_out )
999
1000
1001	USE indices, &
1002	ONLY: ny
1003
1004	USE kinds
1005
1006	USE transpose_indices, &
1007	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1008
1009	IMPLICIT NONE
1010
1011	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1012	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1013
1014
1015	INTEGER(iwp) :: i !<
1016	INTEGER(iwp) :: j !<
1017	INTEGER(iwp) :: k !<
1018
1019	!
1020	!-- Rearrange indices of input array in order to make data to be send
1021	!-- by MPI contiguous
1022	!$OMP PARALLEL PRIVATE ( i, j, k )
1023	!$OMP DO
1024	#if __acc_fft_device
1025	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1026	!$ACC PRESENT(f_out, f_inv)
1027	#endif
1028	DO k = nzb_y, nzt_y
1029	DO i = nxl_y, nxr_y
1030	DO j = 0, ny
1031	f_out(j,i,k) = f_inv(i,k,j)
1032	ENDDO
1033	ENDDO
1034	ENDDO
1035	!$OMP END PARALLEL
1036
1037	END SUBROUTINE resort_for_zy
1038
1039
1040	!------------------------------------------------------------------------------!
1041	! Description:cpu_log_nowait
1042	! ------------
1043	!> Transposition of input array (f_in) from z to y. For the input array, all
1044	!> elements along z reside on the same PE, while after transposition, all
1045	!> elements along y reside on the same PE.
1046	!------------------------------------------------------------------------------!
1047	SUBROUTINE transpose_zy( f_in, f_inv )
1048
1049
1050	USE cpulog, &
1051	ONLY: cpu_log, cpu_log_nowait, log_point_s
1052
1053	USE indices, &
1054	ONLY: ny, nz
1055
1056	USE kinds
1057
1058	USE pegrid
1059
1060	USE transpose_indices, &
1061	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1062
1063	IMPLICIT NONE
1064
1065	INTEGER(iwp) :: i !<
1066	INTEGER(iwp) :: j !<
1067	INTEGER(iwp) :: k !<
1068	INTEGER(iwp) :: l !<
1069	INTEGER(iwp) :: zs !<
1070
1071	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1072	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1073
1074	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1075	#if __acc_fft_device
1076	!$ACC DECLARE CREATE(work)
1077	#endif
1078
1079	!
1080	!-- If the PE grid is one-dimensional along y, the array has only to be
1081	!-- reordered locally and therefore no transposition has to be done.
1082	IF ( pdims(1) /= 1 ) THEN
1083
1084	#if defined( __parallel )
1085	!
1086	!-- Reorder input array for transposition
1087	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1088	!$OMP DO
1089	DO l = 0, pdims(1) - 1
1090	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1091	#if __acc_fft_device
1092	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1093	!$ACC PRESENT(work, f_in)
1094	#endif
1095	DO j = nys_z, nyn_z
1096	DO k = zs, zs + nzt_y - nzb_y
1097	DO i = nxl_z, nxr_z
1098	work(i,k-zs+1,j,l) = f_in(i,j,k)
1099	ENDDO
1100	ENDDO
1101	ENDDO
1102	ENDDO
1103	!$OMP END PARALLEL
1104
1105	!
1106	!-- Transpose array
1107	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1108
1109	#if __acc_fft_device
1110	#ifndef __cuda_aware_mpi
1111	!$ACC UPDATE HOST(work)
1112	#else
1113	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1114	#endif
1115	#endif
1116
1117	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1118	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1119	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1120	comm1dx, ierr )
1121
1122	#if __acc_fft_device
1123	#ifndef __cuda_aware_mpi
1124	!$ACC UPDATE DEVICE(f_inv)
1125	#else
1126	!$ACC END HOST_DATA
1127	#endif
1128	#endif
1129
1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1131	#endif
1132
1133	ELSE
1134	!
1135	!-- Reorder the array in the same way like ALLTOALL did it
1136	!$OMP PARALLEL PRIVATE ( i, j, k )
1137	!$OMP DO
1138	#if __acc_fft_device
1139	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1140	!$ACC PRESENT(f_inv, f_in)
1141	#endif
1142	DO k = nzb_y, nzt_y
1143	DO j = 0, ny
1144	DO i = nxl_y, nxr_y
1145	f_inv(i,k,j) = f_in(i,j,k)
1146	ENDDO
1147	ENDDO
1148	ENDDO
1149	!$OMP END PARALLEL
1150
1151	ENDIF
1152
1153	END SUBROUTINE transpose_zy
1154
1155
1156	!------------------------------------------------------------------------------!
1157	! Description:
1158	! ------------
1159	!> Transposition of input array (f_in) from z to y. For the input array, all
1160	!> elements along z reside on the same PE, while after transposition, all
1161	!> elements along y reside on the same PE.
1162	!> This is a direct transposition for arrays with indices in regular order
1163	!> (k,j,i) (cf. transpose_zy).
1164	!------------------------------------------------------------------------------!
1165	SUBROUTINE transpose_zyd( f_in, f_out )
1166
1167
1168	USE cpulog, &
1169	ONLY: cpu_log, log_point_s
1170
1171	USE indices, &
1172	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1173
1174	USE kinds
1175
1176	USE pegrid
1177
1178	USE transpose_indices, &
1179	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1180
1181	IMPLICIT NONE
1182
1183	INTEGER(iwp) :: i !<
1184	INTEGER(iwp) :: j !<
1185	INTEGER(iwp) :: k !<
1186	INTEGER(iwp) :: l !<
1187	INTEGER(iwp) :: m !<
1188	INTEGER(iwp) :: ys !<
1189
1190	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1191	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1192	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1193	REAL(wp) :: work(nnxnnynnz) !<
1194
1195	#if defined( __parallel )
1196
1197	!
1198	!-- Rearrange indices of input array in order to make data to be send
1199	!-- by MPI contiguous
1200	DO i = nxl, nxr
1201	DO j = nys, nyn
1202	DO k = 1, nz
1203	f_inv(j,i,k) = f_in(k,j,i)
1204	ENDDO
1205	ENDDO
1206	ENDDO
1207
1208	!
1209	!-- Move data to different array, because memory location of work1 is
1210	!-- needed further below (work1 = work2).
1211	!-- If the PE grid is one-dimensional along x, only local reordering
1212	!-- of the data is necessary and no transposition has to be done.
1213	IF ( pdims(2) == 1 ) THEN
1214	DO k = 1, nz
1215	DO i = nxl, nxr
1216	DO j = nys, nyn
1217	f_out(j,i,k) = f_inv(j,i,k)
1218	ENDDO
1219	ENDDO
1220	ENDDO
1221	RETURN
1222	ENDIF
1223
1224	!
1225	!-- Transpose array
1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1228	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1229	work(1), sendrecvcount_zyd, MPI_REAL, &
1230	comm1dy, ierr )
1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1232
1233	!
1234	!-- Reorder transposed array
1235	m = 0
1236	DO l = 0, pdims(2) - 1
1237	ys = 0 + l * nny
1238	DO k = nzb_yd, nzt_yd
1239	DO i = nxl_yd, nxr_yd
1240	DO j = ys, ys + nny - 1
1241	m = m + 1
1242	f_out(j,i,k) = work(m)
1243	ENDDO
1244	ENDDO
1245	ENDDO
1246	ENDDO
1247
1248	#endif
1249
1250	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |