Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3690

Last change on this file since 3690 was 3690, checked in by knoop, 6 years ago
Enabled OpenACC usage without using the cudaFFT library. Added respective palmtest build configuration and testcase.
Property svn:keywords set to `Id`
File size: 33.9 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 3690 2019-01-22 22:56:42Z knoop $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed
31	!
32	! 2718 2018-01-02 08:49:38Z maronga
33	! Corrected "Former revisions" section
34	!
35	! 2696 2017-12-14 17:12:51Z kanani
36	! Change in file header (GPL part)
37	!
38	! 2119 2017-01-17 16:51:50Z raasch
39	!
40	! 2118 2017-01-17 16:38:49Z raasch
41	! OpenACC directives removed
42	!
43	! 2000 2016-08-20 18:09:15Z knoop
44	! Forced header and separation lines into 80 columns
45	!
46	! 1682 2015-10-07 23:56:08Z knoop
47	! Code annotations made doxygen readable
48	!
49	! 1324 2014-03-21 09:13:16Z suehring
50	! Bugfix: ONLY statement for module pegrid removed
51	!
52	! 1320 2014-03-20 08:40:49Z raasch
53	! ONLY-attribute added to USE-statements,
54	! kind-parameters added to all INTEGER and REAL declaration statements,
55	! kinds are defined in new module kinds,
56	! old module precision_kind is removed,
57	! revision history before 2012 removed,
58	! comment fields (!:) to be used for variable explanations added to
59	! all variable declaration statements
60	!
61	! 1318 2014-03-17 13:35:16Z raasch
62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
63	! required for solving the Poisson equation (poisfft),
64	! module interfaces removed
65	!
66	! 1257 2013-11-08 15:18:40Z raasch
67	! openacc loop and loop vector clauses removed
68	!
69	! 1216 2013-08-26 09:31:42Z raasch
70	! re-sorting of the transposed / to be transposed arrays moved to separate
71	! routines resort_for_...
72	!
73	! 1111 2013-03-08 23:54:10Z raasch
74	! openACC directives added,
75	! resorting data from/to work changed, work got 4 dimensions instead of 1
76	!
77	! 1106 2013-03-04 05:31:38Z raasch
78	! preprocessor lines rearranged so that routines can also be used in serial
79	! (non-parallel) mode
80	!
81	! 1092 2013-02-02 11:24:22Z raasch
82	! unused variables removed
83	!
84	! 1036 2012-10-22 13:43:42Z raasch
85	! code put under GPL (PALM 3.9)
86	!
87	! 1003 2012-09-14 14:35:53Z raasch
88	! indices nxa, nya, etc. replaced by nx, ny, etc.
89	!
90	! Revision 1.1 1997/07/24 11:25:18 raasch
91	! Initial revision
92	!
93
94	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
95
96	!------------------------------------------------------------------------------!
97	! Description:
98	! ------------
99	!> Resorting data for the transposition from x to y. The transposition itself
100	!> is carried out in transpose_xy
101	!------------------------------------------------------------------------------!
102	SUBROUTINE resort_for_xy( f_in, f_inv )
103
104
105	USE indices, &
106	ONLY: nx
107
108	USE kinds
109
110	USE transpose_indices, &
111	ONLY: nyn_x, nys_x, nzb_x, nzt_x
112
113	IMPLICIT NONE
114
115	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
116	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
117
118
119	INTEGER(iwp) :: i !<
120	INTEGER(iwp) :: j !<
121	INTEGER(iwp) :: k !<
122	!
123	!-- Rearrange indices of input array in order to make data to be send
124	!-- by MPI contiguous
125	!$OMP PARALLEL PRIVATE ( i, j, k )
126	!$OMP DO
127	#if __acc_fft_device
128	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
129	!$ACC PRESENT(f_inv, f_in)
130	#endif
131	DO i = 0, nx
132	DO k = nzb_x, nzt_x
133	DO j = nys_x, nyn_x
134	f_inv(j,k,i) = f_in(i,j,k)
135	ENDDO
136	ENDDO
137	ENDDO
138	!$OMP END PARALLEL
139
140	END SUBROUTINE resort_for_xy
141
142
143	!------------------------------------------------------------------------------!
144	! Description:
145	! ------------
146	!> Transposition of input array (f_in) from x to y. For the input array, all
147	!> elements along x reside on the same PE, while after transposition, all
148	!> elements along y reside on the same PE.
149	!------------------------------------------------------------------------------!
150	SUBROUTINE transpose_xy( f_inv, f_out )
151
152
153	USE cpulog, &
154	ONLY: cpu_log, cpu_log_nowait, log_point_s
155
156	USE indices, &
157	ONLY: nx, ny
158
159	USE kinds
160
161	USE pegrid
162
163	USE transpose_indices, &
164	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
165
166	IMPLICIT NONE
167
168	INTEGER(iwp) :: i !<
169	INTEGER(iwp) :: j !<
170	INTEGER(iwp) :: k !<
171	INTEGER(iwp) :: l !<
172	INTEGER(iwp) :: ys !<
173
174	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
175	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
176
177	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
178	#if __acc_fft_device
179	!$ACC DECLARE CREATE(work)
180	#endif
181
182
183	IF ( numprocs /= 1 ) THEN
184
185	#if defined( __parallel )
186	!
187	!-- Transpose array
188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
189
190	#if __acc_fft_device
191	#ifndef __cuda_aware_mpi
192	!$ACC UPDATE HOST(f_inv)
193	#else
194	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
195	#endif
196	#endif
197
198	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
199	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
200	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
201	comm1dy, ierr )
202
203	#if __acc_fft_device
204	#ifndef __cuda_aware_mpi
205	!$ACC UPDATE DEVICE(work)
206	#else
207	!$ACC END HOST_DATA
208	#endif
209	#endif
210
211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
212
213	!
214	!-- Reorder transposed array
215	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
216	!$OMP DO
217	DO l = 0, pdims(2) - 1
218	ys = 0 + l * ( nyn_x - nys_x + 1 )
219	#if __acc_fft_device
220	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
221	!$ACC PRESENT(f_out, work)
222	#endif
223	DO i = nxl_y, nxr_y
224	DO k = nzb_y, nzt_y
225	DO j = ys, ys + nyn_x - nys_x
226	f_out(j,i,k) = work(j-ys+1,k,i,l)
227	ENDDO
228	ENDDO
229	ENDDO
230	ENDDO
231	!$OMP END PARALLEL
232	#endif
233
234	ELSE
235
236	!
237	!-- Reorder transposed array
238	!$OMP PARALLEL PRIVATE ( i, j, k )
239	!$OMP DO
240	#if __acc_fft_device
241	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
242	!$ACC PRESENT(f_out, f_inv)
243	#endif
244	DO k = nzb_y, nzt_y
245	DO i = nxl_y, nxr_y
246	DO j = 0, ny
247	f_out(j,i,k) = f_inv(j,k,i)
248	ENDDO
249	ENDDO
250	ENDDO
251	!$OMP END PARALLEL
252
253	ENDIF
254
255	END SUBROUTINE transpose_xy
256
257
258	!------------------------------------------------------------------------------!
259	! Description:
260	! ------------
261	!> Resorting data after the transposition from x to z. The transposition itself
262	!> is carried out in transpose_xz
263	!------------------------------------------------------------------------------!
264	SUBROUTINE resort_for_xz( f_inv, f_out )
265
266
267	USE indices, &
268	ONLY: nxl, nxr, nyn, nys, nz
269
270	USE kinds
271
272	IMPLICIT NONE
273
274	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
275	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
276
277	INTEGER(iwp) :: i !<
278	INTEGER(iwp) :: j !<
279	INTEGER(iwp) :: k !<
280	!
281	!-- Rearrange indices of input array in order to make data to be send
282	!-- by MPI contiguous.
283	!-- In case of parallel fft/transposition, scattered store is faster in
284	!-- backward direction!!!
285	!$OMP PARALLEL PRIVATE ( i, j, k )
286	!$OMP DO
287	#if __acc_fft_device
288	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
289	!$ACC PRESENT(f_out, f_inv)
290	#endif
291	DO k = 1, nz
292	DO i = nxl, nxr
293	DO j = nys, nyn
294	f_out(k,j,i) = f_inv(j,i,k)
295	ENDDO
296	ENDDO
297	ENDDO
298	!$OMP END PARALLEL
299
300	END SUBROUTINE resort_for_xz
301
302
303	!------------------------------------------------------------------------------!
304	! Description:
305	! ------------
306	!> Transposition of input array (f_in) from x to z. For the input array, all
307	!> elements along x reside on the same PE, while after transposition, all
308	!> elements along z reside on the same PE.
309	!------------------------------------------------------------------------------!
310	SUBROUTINE transpose_xz( f_in, f_inv )
311
312
313	USE cpulog, &
314	ONLY: cpu_log, cpu_log_nowait, log_point_s
315
316	USE indices, &
317	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
318
319	USE kinds
320
321	USE pegrid
322
323	USE transpose_indices, &
324	ONLY: nyn_x, nys_x, nzb_x, nzt_x
325
326	IMPLICIT NONE
327
328	INTEGER(iwp) :: i !<
329	INTEGER(iwp) :: j !<
330	INTEGER(iwp) :: k !<
331	INTEGER(iwp) :: l !<
332	INTEGER(iwp) :: xs !<
333
334	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
335	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
336
337	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
338	#if __acc_fft_device
339	!$ACC DECLARE CREATE(work)
340	#endif
341
342
343	!
344	!-- If the PE grid is one-dimensional along y, the array has only to be
345	!-- reordered locally and therefore no transposition has to be done.
346	IF ( pdims(1) /= 1 ) THEN
347
348	#if defined( __parallel )
349	!
350	!-- Reorder input array for transposition
351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
352	!$OMP DO
353	DO l = 0, pdims(1) - 1
354	xs = 0 + l * nnx
355	#if __acc_fft_device
356	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
357	!$ACC PRESENT(work, f_in)
358	#endif
359	DO k = nzb_x, nzt_x
360	DO i = xs, xs + nnx - 1
361	DO j = nys_x, nyn_x
362	work(j,i-xs+1,k,l) = f_in(i,j,k)
363	ENDDO
364	ENDDO
365	ENDDO
366	ENDDO
367	!$OMP END PARALLEL
368
369	!
370	!-- Transpose array
371	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
372
373	#if __acc_fft_device
374	#ifndef __cuda_aware_mpi
375	!$ACC UPDATE HOST(work)
376	#else
377	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
378	#endif
379	#endif
380
381	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
382	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
383	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
384	comm1dx, ierr )
385
386	#if __acc_fft_device
387	#ifndef __cuda_aware_mpi
388	!$ACC UPDATE DEVICE(f_inv)
389	#else
390	!$ACC END HOST_DATA
391	#endif
392	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
393	#endif
394	#endif
395
396	ELSE
397
398	!
399	!-- Reorder the array in a way that the z index is in first position
400	!$OMP PARALLEL PRIVATE ( i, j, k )
401	!$OMP DO
402	#if __acc_fft_device
403	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
404	!$ACC PRESENT(f_inv, f_in)
405	#endif
406	DO i = nxl, nxr
407	DO j = nys, nyn
408	DO k = 1, nz
409	f_inv(j,i,k) = f_in(i,j,k)
410	ENDDO
411	ENDDO
412	ENDDO
413	!$OMP END PARALLEL
414
415	ENDIF
416
417	END SUBROUTINE transpose_xz
418
419
420	!------------------------------------------------------------------------------!
421	! Description:
422	! ------------
423	!> Resorting data after the transposition from y to x. The transposition itself
424	!> is carried out in transpose_yx
425	!------------------------------------------------------------------------------!
426	SUBROUTINE resort_for_yx( f_inv, f_out )
427
428
429	USE indices, &
430	ONLY: nx
431
432	USE kinds
433
434	USE transpose_indices, &
435	ONLY: nyn_x, nys_x, nzb_x, nzt_x
436
437	IMPLICIT NONE
438
439	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
440	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
441
442
443	INTEGER(iwp) :: i !<
444	INTEGER(iwp) :: j !<
445	INTEGER(iwp) :: k !<
446	!
447	!-- Rearrange indices of input array in order to make data to be send
448	!-- by MPI contiguous
449	!$OMP PARALLEL PRIVATE ( i, j, k )
450	!$OMP DO
451	#if __acc_fft_device
452	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
453	!$ACC PRESENT(f_out, f_inv)
454	#endif
455	DO i = 0, nx
456	DO k = nzb_x, nzt_x
457	DO j = nys_x, nyn_x
458	f_out(i,j,k) = f_inv(j,k,i)
459	ENDDO
460	ENDDO
461	ENDDO
462	!$OMP END PARALLEL
463
464	END SUBROUTINE resort_for_yx
465
466
467	!------------------------------------------------------------------------------!
468	! Description:
469	! ------------
470	!> Transposition of input array (f_in) from y to x. For the input array, all
471	!> elements along y reside on the same PE, while after transposition, all
472	!> elements along x reside on the same PE.
473	!------------------------------------------------------------------------------!
474	SUBROUTINE transpose_yx( f_in, f_inv )
475
476
477	USE cpulog, &
478	ONLY: cpu_log, cpu_log_nowait, log_point_s
479
480	USE indices, &
481	ONLY: nx, ny
482
483	USE kinds
484
485	USE pegrid
486
487	USE transpose_indices, &
488	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
489
490	IMPLICIT NONE
491
492	INTEGER(iwp) :: i !<
493	INTEGER(iwp) :: j !<
494	INTEGER(iwp) :: k !<
495	INTEGER(iwp) :: l !<
496	INTEGER(iwp) :: ys !<
497
498	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
499	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
500
501	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
502	#if __acc_fft_device
503	!$ACC DECLARE CREATE(work)
504	#endif
505
506
507	IF ( numprocs /= 1 ) THEN
508
509	#if defined( __parallel )
510	!
511	!-- Reorder input array for transposition
512	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
513	!$OMP DO
514	DO l = 0, pdims(2) - 1
515	ys = 0 + l * ( nyn_x - nys_x + 1 )
516	#if __acc_fft_device
517	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
518	!$ACC PRESENT(work, f_in)
519	#endif
520	DO i = nxl_y, nxr_y
521	DO k = nzb_y, nzt_y
522	DO j = ys, ys + nyn_x - nys_x
523	work(j-ys+1,k,i,l) = f_in(j,i,k)
524	ENDDO
525	ENDDO
526	ENDDO
527	ENDDO
528	!$OMP END PARALLEL
529
530	!
531	!-- Transpose array
532	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
533
534	#if __acc_fft_device
535	#ifndef __cuda_aware_mpi
536	!$ACC UPDATE HOST(work)
537	#else
538	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
539	#endif
540	#endif
541
542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
543	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
544	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
545	comm1dy, ierr )
546
547	#if __acc_fft_device
548	#ifndef __cuda_aware_mpi
549	!$ACC UPDATE DEVICE(f_inv)
550	#else
551	!$ACC END HOST_DATA
552	#endif
553	#endif
554
555	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
556	#endif
557
558	ELSE
559
560	!
561	!-- Reorder array f_in the same way as ALLTOALL did it
562	!$OMP PARALLEL PRIVATE ( i, j, k )
563	!$OMP DO
564	#if __acc_fft_device
565	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
566	!$ACC PRESENT(f_inv, f_in)
567	#endif
568	DO i = nxl_y, nxr_y
569	DO k = nzb_y, nzt_y
570	DO j = 0, ny
571	f_inv(j,k,i) = f_in(j,i,k)
572	ENDDO
573	ENDDO
574	ENDDO
575	!$OMP END PARALLEL
576
577	ENDIF
578
579	END SUBROUTINE transpose_yx
580
581
582	!------------------------------------------------------------------------------!
583	! Description:
584	! ------------
585	!> Transposition of input array (f_in) from y to x. For the input array, all
586	!> elements along y reside on the same PE, while after transposition, all
587	!> elements along x reside on the same PE.
588	!> This is a direct transposition for arrays with indices in regular order
589	!> (k,j,i) (cf. transpose_yx).
590	!------------------------------------------------------------------------------!
591	SUBROUTINE transpose_yxd( f_in, f_out )
592
593
594	USE cpulog, &
595	ONLY: cpu_log, log_point_s
596
597	USE indices, &
598	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
599
600	USE kinds
601
602	USE pegrid
603
604	USE transpose_indices, &
605	ONLY: nyn_x, nys_x, nzb_x, nzt_x
606
607	IMPLICIT NONE
608
609	INTEGER(iwp) :: i !<
610	INTEGER(iwp) :: j !<
611	INTEGER(iwp) :: k !<
612	INTEGER(iwp) :: l !<
613	INTEGER(iwp) :: m !<
614	INTEGER(iwp) :: xs !<
615
616	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
617	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
618	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
619	REAL(wp) :: work(nnxnnynnz) !<
620	#if defined( __parallel )
621
622	!
623	!-- Rearrange indices of input array in order to make data to be send
624	!-- by MPI contiguous
625	DO k = 1, nz
626	DO j = nys, nyn
627	DO i = nxl, nxr
628	f_inv(i,k,j) = f_in(k,j,i)
629	ENDDO
630	ENDDO
631	ENDDO
632
633	!
634	!-- Transpose array
635	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
636	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
637	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
638	work(1), sendrecvcount_xy, MPI_REAL, &
639	comm1dx, ierr )
640	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
641
642	!
643	!-- Reorder transposed array
644	m = 0
645	DO l = 0, pdims(1) - 1
646	xs = 0 + l * nnx
647	DO j = nys_x, nyn_x
648	DO k = 1, nz
649	DO i = xs, xs + nnx - 1
650	m = m + 1
651	f_out(i,j,k) = work(m)
652	ENDDO
653	ENDDO
654	ENDDO
655	ENDDO
656
657	#endif
658
659	END SUBROUTINE transpose_yxd
660
661
662	!------------------------------------------------------------------------------!
663	! Description:
664	! ------------
665	!> Resorting data for the transposition from y to z. The transposition itself
666	!> is carried out in transpose_yz
667	!------------------------------------------------------------------------------!
668	SUBROUTINE resort_for_yz( f_in, f_inv )
669
670
671	USE indices, &
672	ONLY: ny
673
674	USE kinds
675
676	USE transpose_indices, &
677	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
678
679	IMPLICIT NONE
680
681	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
682	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
683
684	INTEGER(iwp) :: i !<
685	INTEGER(iwp) :: j !<
686	INTEGER(iwp) :: k !<
687
688	!
689	!-- Rearrange indices of input array in order to make data to be send
690	!-- by MPI contiguous
691	!$OMP PARALLEL PRIVATE ( i, j, k )
692	!$OMP DO
693	#if __acc_fft_device
694	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
695	!$ACC PRESENT(f_inv, f_in)
696	#endif
697	DO j = 0, ny
698	DO k = nzb_y, nzt_y
699	DO i = nxl_y, nxr_y
700	f_inv(i,k,j) = f_in(j,i,k)
701	ENDDO
702	ENDDO
703	ENDDO
704	!$OMP END PARALLEL
705
706	END SUBROUTINE resort_for_yz
707
708
709	!------------------------------------------------------------------------------!
710	! Description:
711	! ------------
712	!> Transposition of input array (f_in) from y to z. For the input array, all
713	!> elements along y reside on the same PE, while after transposition, all
714	!> elements along z reside on the same PE.
715	!------------------------------------------------------------------------------!
716	SUBROUTINE transpose_yz( f_inv, f_out )
717
718
719	USE cpulog, &
720	ONLY: cpu_log, cpu_log_nowait, log_point_s
721
722	USE indices, &
723	ONLY: ny, nz
724
725	USE kinds
726
727	USE pegrid
728
729	USE transpose_indices, &
730	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
731
732	IMPLICIT NONE
733
734	INTEGER(iwp) :: i !<
735	INTEGER(iwp) :: j !<
736	INTEGER(iwp) :: k !<
737	INTEGER(iwp) :: l !<
738	INTEGER(iwp) :: zs !<
739
740	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
741	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
742
743	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
744	#if __acc_fft_device
745	!$ACC DECLARE CREATE(work)
746	#endif
747
748
749	!
750	!-- If the PE grid is one-dimensional along y, only local reordering
751	!-- of the data is necessary and no transposition has to be done.
752	IF ( pdims(1) == 1 ) THEN
753
754	!$OMP PARALLEL PRIVATE ( i, j, k )
755	!$OMP DO
756	#if __acc_fft_device
757	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
758	!$ACC PRESENT(f_out, f_inv)
759	#endif
760	DO j = 0, ny
761	DO k = nzb_y, nzt_y
762	DO i = nxl_y, nxr_y
763	f_out(i,j,k) = f_inv(i,k,j)
764	ENDDO
765	ENDDO
766	ENDDO
767	!$OMP END PARALLEL
768
769	ELSE
770
771	#if defined( __parallel )
772	!
773	!-- Transpose array
774	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
775
776	#if __acc_fft_device
777	#ifndef __cuda_aware_mpi
778	!$ACC UPDATE HOST(f_inv)
779	#else
780	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
781	#endif
782	#endif
783
784	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
785	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
786	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
787	comm1dx, ierr )
788
789	#if __acc_fft_device
790	#ifndef __cuda_aware_mpi
791	!$ACC UPDATE DEVICE(work)
792	#else
793	!$ACC END HOST_DATA
794	#endif
795	#endif
796
797	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
798
799	!
800	!-- Reorder transposed array
801	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
802	!$OMP DO
803	DO l = 0, pdims(1) - 1
804	zs = 1 + l * ( nzt_y - nzb_y + 1 )
805	#if __acc_fft_device
806	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
807	!$ACC PRESENT(f_out, work)
808	#endif
809	DO j = nys_z, nyn_z
810	DO k = zs, zs + nzt_y - nzb_y
811	DO i = nxl_z, nxr_z
812	f_out(i,j,k) = work(i,k-zs+1,j,l)
813	ENDDO
814	ENDDO
815	ENDDO
816	ENDDO
817	!$OMP END PARALLEL
818	#endif
819
820	ENDIF
821
822	END SUBROUTINE transpose_yz
823
824
825	!------------------------------------------------------------------------------!
826	! Description:
827	! ------------
828	!> Resorting data for the transposition from z to x. The transposition itself
829	!> is carried out in transpose_zx
830	!------------------------------------------------------------------------------!
831	SUBROUTINE resort_for_zx( f_in, f_inv )
832
833
834	USE indices, &
835	ONLY: nxl, nxr, nyn, nys, nz
836
837	USE kinds
838
839	IMPLICIT NONE
840
841	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
842	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
843
844	INTEGER(iwp) :: i !<
845	INTEGER(iwp) :: j !<
846	INTEGER(iwp) :: k !<
847
848	!
849	!-- Rearrange indices of input array in order to make data to be send
850	!-- by MPI contiguous
851	!$OMP PARALLEL PRIVATE ( i, j, k )
852	!$OMP DO
853	#if __acc_fft_device
854	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
855	!$ACC PRESENT(f_in, f_inv)
856	#endif
857	DO k = 1,nz
858	DO i = nxl, nxr
859	DO j = nys, nyn
860	f_inv(j,i,k) = f_in(k,j,i)
861	ENDDO
862	ENDDO
863	ENDDO
864	!$OMP END PARALLEL
865
866	END SUBROUTINE resort_for_zx
867
868
869	!------------------------------------------------------------------------------!
870	! Description:
871	! ------------
872	!> Transposition of input array (f_in) from z to x. For the input array, all
873	!> elements along z reside on the same PE, while after transposition, all
874	!> elements along x reside on the same PE.
875	!------------------------------------------------------------------------------!
876	SUBROUTINE transpose_zx( f_inv, f_out )
877
878
879	USE cpulog, &
880	ONLY: cpu_log, cpu_log_nowait, log_point_s
881
882	USE indices, &
883	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
884
885	USE kinds
886
887	USE pegrid
888
889	USE transpose_indices, &
890	ONLY: nyn_x, nys_x, nzb_x, nzt_x
891
892	IMPLICIT NONE
893
894	INTEGER(iwp) :: i !<
895	INTEGER(iwp) :: j !<
896	INTEGER(iwp) :: k !<
897	INTEGER(iwp) :: l !<
898	INTEGER(iwp) :: xs !<
899
900	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
901	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
902
903	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
904	#if __acc_fft_device
905	!$ACC DECLARE CREATE(work)
906	#endif
907
908
909	!
910	!-- If the PE grid is one-dimensional along y, only local reordering
911	!-- of the data is necessary and no transposition has to be done.
912	IF ( pdims(1) == 1 ) THEN
913
914	!$OMP PARALLEL PRIVATE ( i, j, k )
915	!$OMP DO
916	#if __acc_fft_device
917	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
918	!$ACC PRESENT(f_out, f_inv)
919	#endif
920	DO k = 1, nz
921	DO i = nxl, nxr
922	DO j = nys, nyn
923	f_out(i,j,k) = f_inv(j,i,k)
924	ENDDO
925	ENDDO
926	ENDDO
927	!$OMP END PARALLEL
928
929	ELSE
930
931	#if defined( __parallel )
932	!
933	!-- Transpose array
934	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
935
936	#if __acc_fft_device
937	#ifndef __cuda_aware_mpi
938	!$ACC UPDATE HOST(f_inv)
939	#else
940	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
941	#endif
942	#endif
943
944	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
945	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
946	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
947	comm1dx, ierr )
948
949	#if __acc_fft_device
950	#ifndef __cuda_aware_mpi
951	!$ACC UPDATE DEVICE(work)
952	#else
953	!$ACC END HOST_DATA
954	#endif
955	#endif
956
957	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
958
959	!
960	!-- Reorder transposed array
961	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
962	!$OMP DO
963	DO l = 0, pdims(1) - 1
964	xs = 0 + l * nnx
965	#if __acc_fft_device
966	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
967	!$ACC PRESENT(f_out, work)
968	#endif
969	DO k = nzb_x, nzt_x
970	DO i = xs, xs + nnx - 1
971	DO j = nys_x, nyn_x
972	f_out(i,j,k) = work(j,i-xs+1,k,l)
973	ENDDO
974	ENDDO
975	ENDDO
976	ENDDO
977	!$OMP END PARALLEL
978	#endif
979
980	ENDIF
981
982	END SUBROUTINE transpose_zx
983
984
985	!------------------------------------------------------------------------------!
986	! Description:
987	! ------------
988	!> Resorting data after the transposition from z to y. The transposition itself
989	!> is carried out in transpose_zy
990	!------------------------------------------------------------------------------!
991	SUBROUTINE resort_for_zy( f_inv, f_out )
992
993
994	USE indices, &
995	ONLY: ny
996
997	USE kinds
998
999	USE transpose_indices, &
1000	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1001
1002	IMPLICIT NONE
1003
1004	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1005	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1006
1007
1008	INTEGER(iwp) :: i !<
1009	INTEGER(iwp) :: j !<
1010	INTEGER(iwp) :: k !<
1011
1012	!
1013	!-- Rearrange indices of input array in order to make data to be send
1014	!-- by MPI contiguous
1015	!$OMP PARALLEL PRIVATE ( i, j, k )
1016	!$OMP DO
1017	#if __acc_fft_device
1018	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1019	!$ACC PRESENT(f_out, f_inv)
1020	#endif
1021	DO k = nzb_y, nzt_y
1022	DO j = 0, ny
1023	DO i = nxl_y, nxr_y
1024	f_out(j,i,k) = f_inv(i,k,j)
1025	ENDDO
1026	ENDDO
1027	ENDDO
1028	!$OMP END PARALLEL
1029
1030	END SUBROUTINE resort_for_zy
1031
1032
1033	!------------------------------------------------------------------------------!
1034	! Description:cpu_log_nowait
1035	! ------------
1036	!> Transposition of input array (f_in) from z to y. For the input array, all
1037	!> elements along z reside on the same PE, while after transposition, all
1038	!> elements along y reside on the same PE.
1039	!------------------------------------------------------------------------------!
1040	SUBROUTINE transpose_zy( f_in, f_inv )
1041
1042
1043	USE cpulog, &
1044	ONLY: cpu_log, cpu_log_nowait, log_point_s
1045
1046	USE indices, &
1047	ONLY: ny, nz
1048
1049	USE kinds
1050
1051	USE pegrid
1052
1053	USE transpose_indices, &
1054	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1055
1056	IMPLICIT NONE
1057
1058	INTEGER(iwp) :: i !<
1059	INTEGER(iwp) :: j !<
1060	INTEGER(iwp) :: k !<
1061	INTEGER(iwp) :: l !<
1062	INTEGER(iwp) :: zs !<
1063
1064	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1065	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1066
1067	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1068	#if __acc_fft_device
1069	!$ACC DECLARE CREATE(work)
1070	#endif
1071
1072	!
1073	!-- If the PE grid is one-dimensional along y, the array has only to be
1074	!-- reordered locally and therefore no transposition has to be done.
1075	IF ( pdims(1) /= 1 ) THEN
1076
1077	#if defined( __parallel )
1078	!
1079	!-- Reorder input array for transposition
1080	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1081	!$OMP DO
1082	DO l = 0, pdims(1) - 1
1083	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1084	#if __acc_fft_device
1085	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1086	!$ACC PRESENT(work, f_in)
1087	#endif
1088	DO j = nys_z, nyn_z
1089	DO k = zs, zs + nzt_y - nzb_y
1090	DO i = nxl_z, nxr_z
1091	work(i,k-zs+1,j,l) = f_in(i,j,k)
1092	ENDDO
1093	ENDDO
1094	ENDDO
1095	ENDDO
1096	!$OMP END PARALLEL
1097
1098	!
1099	!-- Transpose array
1100	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1101
1102	#if __acc_fft_device
1103	#ifndef __cuda_aware_mpi
1104	!$ACC UPDATE HOST(work)
1105	#else
1106	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1107	#endif
1108	#endif
1109
1110	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1111	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1112	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1113	comm1dx, ierr )
1114
1115	#if __acc_fft_device
1116	#ifndef __cuda_aware_mpi
1117	!$ACC UPDATE DEVICE(f_inv)
1118	#else
1119	!$ACC END HOST_DATA
1120	#endif
1121	#endif
1122
1123	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1124	#endif
1125
1126	ELSE
1127	!
1128	!-- Reorder the array in the same way like ALLTOALL did it
1129	!$OMP PARALLEL PRIVATE ( i, j, k )
1130	!$OMP DO
1131	#if __acc_fft_device
1132	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1133	!$ACC PRESENT(f_inv, f_in)
1134	#endif
1135	DO k = nzb_y, nzt_y
1136	DO j = 0, ny
1137	DO i = nxl_y, nxr_y
1138	f_inv(i,k,j) = f_in(i,j,k)
1139	ENDDO
1140	ENDDO
1141	ENDDO
1142	!$OMP END PARALLEL
1143
1144	ENDIF
1145
1146	END SUBROUTINE transpose_zy
1147
1148
1149	!------------------------------------------------------------------------------!
1150	! Description:
1151	! ------------
1152	!> Transposition of input array (f_in) from z to y. For the input array, all
1153	!> elements along z reside on the same PE, while after transposition, all
1154	!> elements along y reside on the same PE.
1155	!> This is a direct transposition for arrays with indices in regular order
1156	!> (k,j,i) (cf. transpose_zy).
1157	!------------------------------------------------------------------------------!
1158	SUBROUTINE transpose_zyd( f_in, f_out )
1159
1160
1161	USE cpulog, &
1162	ONLY: cpu_log, log_point_s
1163
1164	USE indices, &
1165	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1166
1167	USE kinds
1168
1169	USE pegrid
1170
1171	USE transpose_indices, &
1172	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1173
1174	IMPLICIT NONE
1175
1176	INTEGER(iwp) :: i !<
1177	INTEGER(iwp) :: j !<
1178	INTEGER(iwp) :: k !<
1179	INTEGER(iwp) :: l !<
1180	INTEGER(iwp) :: m !<
1181	INTEGER(iwp) :: ys !<
1182
1183	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1184	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1185	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1186	REAL(wp) :: work(nnxnnynnz) !<
1187
1188	#if defined( __parallel )
1189
1190	!
1191	!-- Rearrange indices of input array in order to make data to be send
1192	!-- by MPI contiguous
1193	DO i = nxl, nxr
1194	DO j = nys, nyn
1195	DO k = 1, nz
1196	f_inv(j,i,k) = f_in(k,j,i)
1197	ENDDO
1198	ENDDO
1199	ENDDO
1200
1201	!
1202	!-- Move data to different array, because memory location of work1 is
1203	!-- needed further below (work1 = work2).
1204	!-- If the PE grid is one-dimensional along x, only local reordering
1205	!-- of the data is necessary and no transposition has to be done.
1206	IF ( pdims(2) == 1 ) THEN
1207	DO k = 1, nz
1208	DO i = nxl, nxr
1209	DO j = nys, nyn
1210	f_out(j,i,k) = f_inv(j,i,k)
1211	ENDDO
1212	ENDDO
1213	ENDDO
1214	RETURN
1215	ENDIF
1216
1217	!
1218	!-- Transpose array
1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1220	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1221	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1222	work(1), sendrecvcount_zyd, MPI_REAL, &
1223	comm1dy, ierr )
1224	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1225
1226	!
1227	!-- Reorder transposed array
1228	m = 0
1229	DO l = 0, pdims(2) - 1
1230	ys = 0 + l * nny
1231	DO k = nzb_yd, nzt_yd
1232	DO i = nxl_yd, nxr_yd
1233	DO j = ys, ys + nny - 1
1234	m = m + 1
1235	f_out(j,i,k) = work(m)
1236	ENDDO
1237	ENDDO
1238	ENDDO
1239	ENDDO
1240
1241	#endif
1242
1243	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |