Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4366

Last change on this file since 4366 was 4366, checked in by raasch, 4 years ago
code vectorization for NEC Aurora: vectorized version of Temperton FFT, vectorization of Newtor iteration for calculating the Obukhov length
Property svn:keywords set to `Id`
File size: 34.0 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4366 2020-01-09 08:12:43Z raasch $
27	! modifications for NEC vectorization
28	!
29	! 4360 2020-01-07 11:25:50Z suehring
30	! Added missing OpenMP directives
31	!
32	! 4182 2019-08-22 15:20:23Z scharf
33	! Corrected "Former revisions" section
34	!
35	! 4171 2019-08-19 17:44:09Z gronemeier
36	! loop reordering for performance optimization
37	!
38	! 3832 2019-03-28 13:16:58Z raasch
39	! loop reordering for performance optimization
40	!
41	! 3694 2019-01-23 17:01:49Z knoop
42	! OpenACC port for SPEC
43	!
44	! Revision 1.1 1997/07/24 11:25:18 raasch
45	! Initial revision
46	!
47	!
48	! Description:
49	! ------------
50	!> Resorting data for the transposition from x to y. The transposition itself
51	!> is carried out in transpose_xy
52	!------------------------------------------------------------------------------!
53
54	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
55
56	SUBROUTINE resort_for_xy( f_in, f_inv )
57
58
59	USE indices, &
60	ONLY: nx
61
62	USE kinds
63
64	USE transpose_indices, &
65	ONLY: nyn_x, nys_x, nzb_x, nzt_x
66
67	IMPLICIT NONE
68
69	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
70	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
71
72
73	INTEGER(iwp) :: i !<
74	INTEGER(iwp) :: j !<
75	INTEGER(iwp) :: k !<
76	!
77	!-- Rearrange indices of input array in order to make data to be send
78	!-- by MPI contiguous
79	!$OMP PARALLEL PRIVATE ( i, j, k )
80	!$OMP DO
81	#if __acc_fft_device
82	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
83	!$ACC PRESENT(f_inv, f_in)
84	#endif
85	DO k = nzb_x, nzt_x
86	DO j = nys_x, nyn_x
87	DO i = 0, nx
88	f_inv(j,k,i) = f_in(i,j,k)
89	ENDDO
90	ENDDO
91	ENDDO
92	!$OMP END PARALLEL
93
94	END SUBROUTINE resort_for_xy
95
96
97	!------------------------------------------------------------------------------!
98	! Description:
99	! ------------
100	!> Transposition of input array (f_in) from x to y. For the input array, all
101	!> elements along x reside on the same PE, while after transposition, all
102	!> elements along y reside on the same PE.
103	!------------------------------------------------------------------------------!
104	SUBROUTINE transpose_xy( f_inv, f_out )
105
106
107	USE cpulog, &
108	ONLY: cpu_log, cpu_log_nowait, log_point_s
109
110	USE indices, &
111	ONLY: nx, ny
112
113	USE kinds
114
115	USE pegrid
116
117	USE transpose_indices, &
118	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
119
120	IMPLICIT NONE
121
122	INTEGER(iwp) :: i !<
123	INTEGER(iwp) :: j !<
124	INTEGER(iwp) :: k !<
125	INTEGER(iwp) :: l !<
126	INTEGER(iwp) :: ys !<
127
128	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
129	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
130
131	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
132	#if __acc_fft_device
133	!$ACC DECLARE CREATE(work)
134	#endif
135
136
137	IF ( numprocs /= 1 ) THEN
138
139	#if defined( __parallel )
140	!
141	!-- Transpose array
142	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
143
144	#if __acc_fft_device
145	#ifndef __cuda_aware_mpi
146	!$ACC UPDATE HOST(f_inv)
147	#else
148	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
149	#endif
150	#endif
151
152	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
153	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
154	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
155	comm1dy, ierr )
156
157	#if __acc_fft_device
158	#ifndef __cuda_aware_mpi
159	!$ACC UPDATE DEVICE(work)
160	#else
161	!$ACC END HOST_DATA
162	#endif
163	#endif
164
165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
166
167	!
168	!-- Reorder transposed array
169	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
170	DO l = 0, pdims(2) - 1
171	ys = 0 + l * ( nyn_x - nys_x + 1 )
172	#if __acc_fft_device
173	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
174	!$ACC PRESENT(f_out, work)
175	#endif
176	!$OMP DO
177	DO i = nxl_y, nxr_y
178	DO k = nzb_y, nzt_y
179	DO j = ys, ys + nyn_x - nys_x
180	f_out(j,i,k) = work(j-ys+1,k,i,l)
181	ENDDO
182	ENDDO
183	ENDDO
184	!$OMP END DO NOWAIT
185	ENDDO
186	!$OMP END PARALLEL
187	#endif
188
189	ELSE
190
191	!
192	!-- Reorder transposed array
193	!$OMP PARALLEL PRIVATE ( i, j, k )
194	!$OMP DO
195	#if __acc_fft_device
196	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
197	!$ACC PRESENT(f_out, f_inv)
198	#endif
199	DO k = nzb_y, nzt_y
200	DO i = nxl_y, nxr_y
201	DO j = 0, ny
202	f_out(j,i,k) = f_inv(j,k,i)
203	ENDDO
204	ENDDO
205	ENDDO
206	!$OMP END PARALLEL
207
208	ENDIF
209
210	END SUBROUTINE transpose_xy
211
212
213	!------------------------------------------------------------------------------!
214	! Description:
215	! ------------
216	!> Resorting data after the transposition from x to z. The transposition itself
217	!> is carried out in transpose_xz
218	!------------------------------------------------------------------------------!
219	SUBROUTINE resort_for_xz( f_inv, f_out )
220
221
222	USE indices, &
223	ONLY: nxl, nxr, nyn, nys, nz
224
225	USE kinds
226
227	IMPLICIT NONE
228
229	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
230	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
231
232	INTEGER(iwp) :: i !<
233	INTEGER(iwp) :: j !<
234	INTEGER(iwp) :: k !<
235	!
236	!-- Rearrange indices of input array in order to make data to be send
237	!-- by MPI contiguous.
238	!-- In case of parallel fft/transposition, scattered store is faster in
239	!-- backward direction!!!
240	!$OMP PARALLEL PRIVATE ( i, j, k )
241	!$OMP DO
242	#if __acc_fft_device
243	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
244	!$ACC PRESENT(f_out, f_inv)
245	#endif
246	DO i = nxl, nxr
247	DO j = nys, nyn
248	DO k = 1, nz
249	f_out(k,j,i) = f_inv(j,i,k)
250	ENDDO
251	ENDDO
252	ENDDO
253	!$OMP END PARALLEL
254
255	END SUBROUTINE resort_for_xz
256
257
258	!------------------------------------------------------------------------------!
259	! Description:
260	! ------------
261	!> Transposition of input array (f_in) from x to z. For the input array, all
262	!> elements along x reside on the same PE, while after transposition, all
263	!> elements along z reside on the same PE.
264	!------------------------------------------------------------------------------!
265	SUBROUTINE transpose_xz( f_in, f_inv )
266
267
268	USE cpulog, &
269	ONLY: cpu_log, cpu_log_nowait, log_point_s
270
271	USE fft_xy, &
272	ONLY: f_vec, temperton_fft_vec
273
274	USE indices, &
275	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
276
277	USE kinds
278
279	USE pegrid
280
281	USE transpose_indices, &
282	ONLY: nyn_x, nys_x, nzb_x, nzt_x
283
284	IMPLICIT NONE
285
286	INTEGER(iwp) :: i !<
287	INTEGER(iwp) :: j !<
288	INTEGER(iwp) :: k !<
289	INTEGER(iwp) :: l !<
290	INTEGER(iwp) :: mm !<
291	INTEGER(iwp) :: xs !<
292
293	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
294	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
295
296	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
297	#if __acc_fft_device
298	!$ACC DECLARE CREATE(work)
299	#endif
300
301	!
302	!-- If the PE grid is one-dimensional along y, the array has only to be
303	!-- reordered locally and therefore no transposition has to be done.
304	IF ( pdims(1) /= 1 ) THEN
305
306	#if defined( __parallel )
307	!
308	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
309	!-- different array format (f_vec).
310	IF ( temperton_fft_vec ) THEN
311
312	DO l = 0, pdims(1) - 1
313	xs = 0 + l * nnx
314	DO k = nzb_x, nzt_x
315	DO i = xs, xs + nnx - 1
316	DO j = nys_x, nyn_x
317	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
318	work(j,i-xs+1,k,l) = f_vec(mm,i)
319	ENDDO
320	ENDDO
321	ENDDO
322	ENDDO
323
324	ELSE
325
326	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
327	DO l = 0, pdims(1) - 1
328	xs = 0 + l * nnx
329	#if __acc_fft_device
330	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
331	!$ACC PRESENT(work, f_in)
332	#endif
333	!$OMP DO
334	DO k = nzb_x, nzt_x
335	DO i = xs, xs + nnx - 1
336	DO j = nys_x, nyn_x
337	work(j,i-xs+1,k,l) = f_in(i,j,k)
338	ENDDO
339	ENDDO
340	ENDDO
341	!$OMP END DO NOWAIT
342	ENDDO
343	!$OMP END PARALLEL
344
345	ENDIF
346
347	!
348	!-- Transpose array
349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
350
351	#if __acc_fft_device
352	#ifndef __cuda_aware_mpi
353	!$ACC UPDATE HOST(work)
354	#else
355	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
356	#endif
357	#endif
358
359	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
360	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
361	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
362	comm1dx, ierr )
363
364	#if __acc_fft_device
365	#ifndef __cuda_aware_mpi
366	!$ACC UPDATE DEVICE(f_inv)
367	#else
368	!$ACC END HOST_DATA
369	#endif
370	#endif
371
372	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
373	#endif
374
375	ELSE
376
377	!
378	!-- Reorder the array in a way that the z index is in first position
379	!$OMP PARALLEL PRIVATE ( i, j, k )
380	!$OMP DO
381	#if __acc_fft_device
382	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
383	!$ACC PRESENT(f_inv, f_in)
384	#endif
385	DO i = nxl, nxr
386	DO j = nys, nyn
387	DO k = 1, nz
388	f_inv(j,i,k) = f_in(i,j,k)
389	ENDDO
390	ENDDO
391	ENDDO
392	!$OMP END PARALLEL
393
394	ENDIF
395
396	END SUBROUTINE transpose_xz
397
398
399	!------------------------------------------------------------------------------!
400	! Description:
401	! ------------
402	!> Resorting data after the transposition from y to x. The transposition itself
403	!> is carried out in transpose_yx
404	!------------------------------------------------------------------------------!
405	SUBROUTINE resort_for_yx( f_inv, f_out )
406
407
408	USE indices, &
409	ONLY: nx
410
411	USE kinds
412
413	USE transpose_indices, &
414	ONLY: nyn_x, nys_x, nzb_x, nzt_x
415
416	IMPLICIT NONE
417
418	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
419	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
420
421
422	INTEGER(iwp) :: i !<
423	INTEGER(iwp) :: j !<
424	INTEGER(iwp) :: k !<
425	!
426	!-- Rearrange indices of input array in order to make data to be send
427	!-- by MPI contiguous
428	!$OMP PARALLEL PRIVATE ( i, j, k )
429	!$OMP DO
430	#if __acc_fft_device
431	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
432	!$ACC PRESENT(f_out, f_inv)
433	#endif
434	DO k = nzb_x, nzt_x
435	DO j = nys_x, nyn_x
436	DO i = 0, nx
437	f_out(i,j,k) = f_inv(j,k,i)
438	ENDDO
439	ENDDO
440	ENDDO
441	!$OMP END PARALLEL
442
443	END SUBROUTINE resort_for_yx
444
445
446	!------------------------------------------------------------------------------!
447	! Description:
448	! ------------
449	!> Transposition of input array (f_in) from y to x. For the input array, all
450	!> elements along y reside on the same PE, while after transposition, all
451	!> elements along x reside on the same PE.
452	!------------------------------------------------------------------------------!
453	SUBROUTINE transpose_yx( f_in, f_inv )
454
455
456	USE cpulog, &
457	ONLY: cpu_log, cpu_log_nowait, log_point_s
458
459	USE indices, &
460	ONLY: nx, ny
461
462	USE kinds
463
464	USE pegrid
465
466	USE transpose_indices, &
467	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
468
469	IMPLICIT NONE
470
471	INTEGER(iwp) :: i !<
472	INTEGER(iwp) :: j !<
473	INTEGER(iwp) :: k !<
474	INTEGER(iwp) :: l !<
475	INTEGER(iwp) :: ys !<
476
477	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
478	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
479
480	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
481	#if __acc_fft_device
482	!$ACC DECLARE CREATE(work)
483	#endif
484
485
486	IF ( numprocs /= 1 ) THEN
487
488	#if defined( __parallel )
489	!
490	!-- Reorder input array for transposition
491	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
492	DO l = 0, pdims(2) - 1
493	ys = 0 + l * ( nyn_x - nys_x + 1 )
494	#if __acc_fft_device
495	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
496	!$ACC PRESENT(work, f_in)
497	#endif
498	!$OMP DO
499	DO i = nxl_y, nxr_y
500	DO k = nzb_y, nzt_y
501	DO j = ys, ys + nyn_x - nys_x
502	work(j-ys+1,k,i,l) = f_in(j,i,k)
503	ENDDO
504	ENDDO
505	ENDDO
506	!$OMP END DO NOWAIT
507	ENDDO
508	!$OMP END PARALLEL
509
510	!
511	!-- Transpose array
512	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
513
514	#if __acc_fft_device
515	#ifndef __cuda_aware_mpi
516	!$ACC UPDATE HOST(work)
517	#else
518	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
519	#endif
520	#endif
521
522	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
523	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
524	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
525	comm1dy, ierr )
526
527	#if __acc_fft_device
528	#ifndef __cuda_aware_mpi
529	!$ACC UPDATE DEVICE(f_inv)
530	#else
531	!$ACC END HOST_DATA
532	#endif
533	#endif
534
535	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
536	#endif
537
538	ELSE
539
540	!
541	!-- Reorder array f_in the same way as ALLTOALL did it
542	!$OMP PARALLEL PRIVATE ( i, j, k )
543	!$OMP DO
544	#if __acc_fft_device
545	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
546	!$ACC PRESENT(f_inv, f_in)
547	#endif
548	DO i = nxl_y, nxr_y
549	DO k = nzb_y, nzt_y
550	DO j = 0, ny
551	f_inv(j,k,i) = f_in(j,i,k)
552	ENDDO
553	ENDDO
554	ENDDO
555	!$OMP END PARALLEL
556
557	ENDIF
558
559	END SUBROUTINE transpose_yx
560
561
562	!------------------------------------------------------------------------------!
563	! Description:
564	! ------------
565	!> Transposition of input array (f_in) from y to x. For the input array, all
566	!> elements along y reside on the same PE, while after transposition, all
567	!> elements along x reside on the same PE.
568	!> This is a direct transposition for arrays with indices in regular order
569	!> (k,j,i) (cf. transpose_yx).
570	!------------------------------------------------------------------------------!
571	SUBROUTINE transpose_yxd( f_in, f_out )
572
573
574	USE cpulog, &
575	ONLY: cpu_log, log_point_s
576
577	USE indices, &
578	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
579
580	USE kinds
581
582	USE pegrid
583
584	USE transpose_indices, &
585	ONLY: nyn_x, nys_x, nzb_x, nzt_x
586
587	IMPLICIT NONE
588
589	INTEGER(iwp) :: i !<
590	INTEGER(iwp) :: j !<
591	INTEGER(iwp) :: k !<
592	INTEGER(iwp) :: l !<
593	INTEGER(iwp) :: m !<
594	INTEGER(iwp) :: xs !<
595
596	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
597	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
598	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
599	REAL(wp) :: work(nnxnnynnz) !<
600	#if defined( __parallel )
601
602	!
603	!-- Rearrange indices of input array in order to make data to be send
604	!-- by MPI contiguous
605	DO k = 1, nz
606	DO j = nys, nyn
607	DO i = nxl, nxr
608	f_inv(i,k,j) = f_in(k,j,i)
609	ENDDO
610	ENDDO
611	ENDDO
612
613	!
614	!-- Transpose array
615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
616	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
617	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
618	work(1), sendrecvcount_xy, MPI_REAL, &
619	comm1dx, ierr )
620	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
621
622	!
623	!-- Reorder transposed array
624	m = 0
625	DO l = 0, pdims(1) - 1
626	xs = 0 + l * nnx
627	DO j = nys_x, nyn_x
628	DO k = 1, nz
629	DO i = xs, xs + nnx - 1
630	m = m + 1
631	f_out(i,j,k) = work(m)
632	ENDDO
633	ENDDO
634	ENDDO
635	ENDDO
636
637	#endif
638
639	END SUBROUTINE transpose_yxd
640
641
642	!------------------------------------------------------------------------------!
643	! Description:
644	! ------------
645	!> Resorting data for the transposition from y to z. The transposition itself
646	!> is carried out in transpose_yz
647	!------------------------------------------------------------------------------!
648	SUBROUTINE resort_for_yz( f_in, f_inv )
649
650
651	USE indices, &
652	ONLY: ny
653
654	USE kinds
655
656	USE transpose_indices, &
657	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
658
659	IMPLICIT NONE
660
661	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
662	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
663
664	INTEGER(iwp) :: i !<
665	INTEGER(iwp) :: j !<
666	INTEGER(iwp) :: k !<
667
668	!
669	!-- Rearrange indices of input array in order to make data to be send
670	!-- by MPI contiguous
671	!$OMP PARALLEL PRIVATE ( i, j, k )
672	!$OMP DO
673	#if __acc_fft_device
674	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
675	!$ACC PRESENT(f_inv, f_in)
676	#endif
677	DO k = nzb_y, nzt_y
678	DO i = nxl_y, nxr_y
679	DO j = 0, ny
680	f_inv(i,k,j) = f_in(j,i,k)
681	ENDDO
682	ENDDO
683	ENDDO
684	!$OMP END PARALLEL
685
686	END SUBROUTINE resort_for_yz
687
688
689	!------------------------------------------------------------------------------!
690	! Description:
691	! ------------
692	!> Transposition of input array (f_in) from y to z. For the input array, all
693	!> elements along y reside on the same PE, while after transposition, all
694	!> elements along z reside on the same PE.
695	!------------------------------------------------------------------------------!
696	SUBROUTINE transpose_yz( f_inv, f_out )
697
698
699	USE cpulog, &
700	ONLY: cpu_log, cpu_log_nowait, log_point_s
701
702	USE indices, &
703	ONLY: ny, nz
704
705	USE kinds
706
707	USE pegrid
708
709	USE transpose_indices, &
710	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
711
712	IMPLICIT NONE
713
714	INTEGER(iwp) :: i !<
715	INTEGER(iwp) :: j !<
716	INTEGER(iwp) :: k !<
717	INTEGER(iwp) :: l !<
718	INTEGER(iwp) :: zs !<
719
720	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
721	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
722
723	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
724	#if __acc_fft_device
725	!$ACC DECLARE CREATE(work)
726	#endif
727
728
729	!
730	!-- If the PE grid is one-dimensional along y, only local reordering
731	!-- of the data is necessary and no transposition has to be done.
732	IF ( pdims(1) == 1 ) THEN
733
734	!$OMP PARALLEL PRIVATE ( i, j, k )
735	!$OMP DO
736	#if __acc_fft_device
737	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
738	!$ACC PRESENT(f_out, f_inv)
739	#endif
740	DO j = 0, ny
741	DO k = nzb_y, nzt_y
742	DO i = nxl_y, nxr_y
743	f_out(i,j,k) = f_inv(i,k,j)
744	ENDDO
745	ENDDO
746	ENDDO
747	!$OMP END PARALLEL
748
749	ELSE
750
751	#if defined( __parallel )
752	!
753	!-- Transpose array
754	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
755
756	#if __acc_fft_device
757	#ifndef __cuda_aware_mpi
758	!$ACC UPDATE HOST(f_inv)
759	#else
760	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
761	#endif
762	#endif
763
764	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
765	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
766	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
767	comm1dx, ierr )
768
769	#if __acc_fft_device
770	#ifndef __cuda_aware_mpi
771	!$ACC UPDATE DEVICE(work)
772	#else
773	!$ACC END HOST_DATA
774	#endif
775	#endif
776
777	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
778
779	!
780	!-- Reorder transposed array
781	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
782	DO l = 0, pdims(1) - 1
783	zs = 1 + l * ( nzt_y - nzb_y + 1 )
784	#if __acc_fft_device
785	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
786	!$ACC PRESENT(f_out, work)
787	#endif
788	!$OMP DO
789	DO j = nys_z, nyn_z
790	DO k = zs, zs + nzt_y - nzb_y
791	DO i = nxl_z, nxr_z
792	f_out(i,j,k) = work(i,k-zs+1,j,l)
793	ENDDO
794	ENDDO
795	ENDDO
796	!$OMP END DO NOWAIT
797	ENDDO
798	!$OMP END PARALLEL
799	#endif
800
801	ENDIF
802
803	END SUBROUTINE transpose_yz
804
805
806	!------------------------------------------------------------------------------!
807	! Description:
808	! ------------
809	!> Resorting data for the transposition from z to x. The transposition itself
810	!> is carried out in transpose_zx
811	!------------------------------------------------------------------------------!
812	SUBROUTINE resort_for_zx( f_in, f_inv )
813
814
815	USE indices, &
816	ONLY: nxl, nxr, nyn, nys, nz
817
818	USE kinds
819
820	IMPLICIT NONE
821
822	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
823	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
824
825	INTEGER(iwp) :: i !<
826	INTEGER(iwp) :: j !<
827	INTEGER(iwp) :: k !<
828
829	!
830	!-- Rearrange indices of input array in order to make data to be send
831	!-- by MPI contiguous
832	!$OMP PARALLEL PRIVATE ( i, j, k )
833	!$OMP DO
834	#if __acc_fft_device
835	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
836	!$ACC PRESENT(f_in, f_inv)
837	#endif
838	DO i = nxl, nxr
839	DO j = nys, nyn
840	DO k = 1,nz
841	f_inv(j,i,k) = f_in(k,j,i)
842	ENDDO
843	ENDDO
844	ENDDO
845	!$OMP END PARALLEL
846
847	END SUBROUTINE resort_for_zx
848
849
850	!------------------------------------------------------------------------------!
851	! Description:
852	! ------------
853	!> Transposition of input array (f_in) from z to x. For the input array, all
854	!> elements along z reside on the same PE, while after transposition, all
855	!> elements along x reside on the same PE.
856	!------------------------------------------------------------------------------!
857	SUBROUTINE transpose_zx( f_inv, f_out )
858
859
860	USE cpulog, &
861	ONLY: cpu_log, cpu_log_nowait, log_point_s
862
863	USE fft_xy, &
864	ONLY: f_vec, temperton_fft_vec
865
866	USE indices, &
867	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
868
869	USE kinds
870
871	USE pegrid
872
873	USE transpose_indices, &
874	ONLY: nyn_x, nys_x, nzb_x, nzt_x
875
876	IMPLICIT NONE
877
878	INTEGER(iwp) :: i !<
879	INTEGER(iwp) :: j !<
880	INTEGER(iwp) :: k !<
881	INTEGER(iwp) :: l !<
882	INTEGER(iwp) :: mm !<
883	INTEGER(iwp) :: xs !<
884
885	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
886	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
887
888	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
889	#if __acc_fft_device
890	!$ACC DECLARE CREATE(work)
891	#endif
892
893
894	!
895	!-- If the PE grid is one-dimensional along y, only local reordering
896	!-- of the data is necessary and no transposition has to be done.
897	IF ( pdims(1) == 1 ) THEN
898
899	!$OMP PARALLEL PRIVATE ( i, j, k )
900	!$OMP DO
901	#if __acc_fft_device
902	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
903	!$ACC PRESENT(f_out, f_inv)
904	#endif
905	DO k = 1, nz
906	DO i = nxl, nxr
907	DO j = nys, nyn
908	f_out(i,j,k) = f_inv(j,i,k)
909	ENDDO
910	ENDDO
911	ENDDO
912	!$OMP END PARALLEL
913
914	ELSE
915
916	#if defined( __parallel )
917	!
918	!-- Transpose array
919	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
920
921	#if __acc_fft_device
922	#ifndef __cuda_aware_mpi
923	!$ACC UPDATE HOST(f_inv)
924	#else
925	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
926	#endif
927	#endif
928
929	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
930	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
931	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
932	comm1dx, ierr )
933
934	#if __acc_fft_device
935	#ifndef __cuda_aware_mpi
936	!$ACC UPDATE DEVICE(work)
937	#else
938	!$ACC END HOST_DATA
939	#endif
940	#endif
941
942	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
943
944	!
945	!-- Reorder transposed array.
946	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec) which saves
947	!-- additional data copy in fft_x.
948	IF ( temperton_fft_vec ) THEN
949
950	DO l = 0, pdims(1) - 1
951	xs = 0 + l * nnx
952	DO k = nzb_x, nzt_x
953	DO i = xs, xs + nnx - 1
954	DO j = nys_x, nyn_x
955	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
956	f_vec(mm,i) = work(j,i-xs+1,k,l)
957	ENDDO
958	ENDDO
959	ENDDO
960	ENDDO
961
962	ELSE
963
964	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
965	DO l = 0, pdims(1) - 1
966	xs = 0 + l * nnx
967	#if __acc_fft_device
968	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
969	!$ACC PRESENT(f_out, work)
970	#endif
971	!$OMP DO
972	DO k = nzb_x, nzt_x
973	DO i = xs, xs + nnx - 1
974	DO j = nys_x, nyn_x
975	f_out(i,j,k) = work(j,i-xs+1,k,l)
976	ENDDO
977	ENDDO
978	ENDDO
979	!$OMP END DO NOWAIT
980	ENDDO
981	!$OMP END PARALLEL
982	#endif
983
984	ENDIF
985
986	ENDIF
987
988	END SUBROUTINE transpose_zx
989
990
991	!------------------------------------------------------------------------------!
992	! Description:
993	! ------------
994	!> Resorting data after the transposition from z to y. The transposition itself
995	!> is carried out in transpose_zy
996	!------------------------------------------------------------------------------!
997	SUBROUTINE resort_for_zy( f_inv, f_out )
998
999
1000	USE indices, &
1001	ONLY: ny
1002
1003	USE kinds
1004
1005	USE transpose_indices, &
1006	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1007
1008	IMPLICIT NONE
1009
1010	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1011	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1012
1013
1014	INTEGER(iwp) :: i !<
1015	INTEGER(iwp) :: j !<
1016	INTEGER(iwp) :: k !<
1017
1018	!
1019	!-- Rearrange indices of input array in order to make data to be send
1020	!-- by MPI contiguous
1021	!$OMP PARALLEL PRIVATE ( i, j, k )
1022	!$OMP DO
1023	#if __acc_fft_device
1024	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1025	!$ACC PRESENT(f_out, f_inv)
1026	#endif
1027	DO k = nzb_y, nzt_y
1028	DO i = nxl_y, nxr_y
1029	DO j = 0, ny
1030	f_out(j,i,k) = f_inv(i,k,j)
1031	ENDDO
1032	ENDDO
1033	ENDDO
1034	!$OMP END PARALLEL
1035
1036	END SUBROUTINE resort_for_zy
1037
1038
1039	!------------------------------------------------------------------------------!
1040	! Description:cpu_log_nowait
1041	! ------------
1042	!> Transposition of input array (f_in) from z to y. For the input array, all
1043	!> elements along z reside on the same PE, while after transposition, all
1044	!> elements along y reside on the same PE.
1045	!------------------------------------------------------------------------------!
1046	SUBROUTINE transpose_zy( f_in, f_inv )
1047
1048
1049	USE cpulog, &
1050	ONLY: cpu_log, cpu_log_nowait, log_point_s
1051
1052	USE indices, &
1053	ONLY: ny, nz
1054
1055	USE kinds
1056
1057	USE pegrid
1058
1059	USE transpose_indices, &
1060	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1061
1062	IMPLICIT NONE
1063
1064	INTEGER(iwp) :: i !<
1065	INTEGER(iwp) :: j !<
1066	INTEGER(iwp) :: k !<
1067	INTEGER(iwp) :: l !<
1068	INTEGER(iwp) :: zs !<
1069
1070	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1071	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1072
1073	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1074	#if __acc_fft_device
1075	!$ACC DECLARE CREATE(work)
1076	#endif
1077
1078	!
1079	!-- If the PE grid is one-dimensional along y, the array has only to be
1080	!-- reordered locally and therefore no transposition has to be done.
1081	IF ( pdims(1) /= 1 ) THEN
1082
1083	#if defined( __parallel )
1084	!
1085	!-- Reorder input array for transposition
1086	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1087	DO l = 0, pdims(1) - 1
1088	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1089	#if __acc_fft_device
1090	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1091	!$ACC PRESENT(work, f_in)
1092	#endif
1093	!$OMP DO
1094	DO j = nys_z, nyn_z
1095	DO k = zs, zs + nzt_y - nzb_y
1096	DO i = nxl_z, nxr_z
1097	work(i,k-zs+1,j,l) = f_in(i,j,k)
1098	ENDDO
1099	ENDDO
1100	ENDDO
1101	!$OMP END DO NOWAIT
1102	ENDDO
1103	!$OMP END PARALLEL
1104
1105	!
1106	!-- Transpose array
1107	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1108
1109	#if __acc_fft_device
1110	#ifndef __cuda_aware_mpi
1111	!$ACC UPDATE HOST(work)
1112	#else
1113	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1114	#endif
1115	#endif
1116
1117	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1118	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1119	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1120	comm1dx, ierr )
1121
1122	#if __acc_fft_device
1123	#ifndef __cuda_aware_mpi
1124	!$ACC UPDATE DEVICE(f_inv)
1125	#else
1126	!$ACC END HOST_DATA
1127	#endif
1128	#endif
1129
1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1131	#endif
1132
1133	ELSE
1134	!
1135	!-- Reorder the array in the same way like ALLTOALL did it
1136	!$OMP PARALLEL PRIVATE ( i, j, k )
1137	!$OMP DO
1138	#if __acc_fft_device
1139	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1140	!$ACC PRESENT(f_inv, f_in)
1141	#endif
1142	DO k = nzb_y, nzt_y
1143	DO j = 0, ny
1144	DO i = nxl_y, nxr_y
1145	f_inv(i,k,j) = f_in(i,j,k)
1146	ENDDO
1147	ENDDO
1148	ENDDO
1149	!$OMP END PARALLEL
1150
1151	ENDIF
1152
1153	END SUBROUTINE transpose_zy
1154
1155
1156	!------------------------------------------------------------------------------!
1157	! Description:
1158	! ------------
1159	!> Transposition of input array (f_in) from z to y. For the input array, all
1160	!> elements along z reside on the same PE, while after transposition, all
1161	!> elements along y reside on the same PE.
1162	!> This is a direct transposition for arrays with indices in regular order
1163	!> (k,j,i) (cf. transpose_zy).
1164	!------------------------------------------------------------------------------!
1165	SUBROUTINE transpose_zyd( f_in, f_out )
1166
1167
1168	USE cpulog, &
1169	ONLY: cpu_log, log_point_s
1170
1171	USE indices, &
1172	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1173
1174	USE kinds
1175
1176	USE pegrid
1177
1178	USE transpose_indices, &
1179	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1180
1181	IMPLICIT NONE
1182
1183	INTEGER(iwp) :: i !<
1184	INTEGER(iwp) :: j !<
1185	INTEGER(iwp) :: k !<
1186	INTEGER(iwp) :: l !<
1187	INTEGER(iwp) :: m !<
1188	INTEGER(iwp) :: ys !<
1189
1190	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1191	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1192	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1193	REAL(wp) :: work(nnxnnynnz) !<
1194
1195	#if defined( __parallel )
1196
1197	!
1198	!-- Rearrange indices of input array in order to make data to be send
1199	!-- by MPI contiguous
1200	DO i = nxl, nxr
1201	DO j = nys, nyn
1202	DO k = 1, nz
1203	f_inv(j,i,k) = f_in(k,j,i)
1204	ENDDO
1205	ENDDO
1206	ENDDO
1207
1208	!
1209	!-- Move data to different array, because memory location of work1 is
1210	!-- needed further below (work1 = work2).
1211	!-- If the PE grid is one-dimensional along x, only local reordering
1212	!-- of the data is necessary and no transposition has to be done.
1213	IF ( pdims(2) == 1 ) THEN
1214	DO k = 1, nz
1215	DO i = nxl, nxr
1216	DO j = nys, nyn
1217	f_out(j,i,k) = f_inv(j,i,k)
1218	ENDDO
1219	ENDDO
1220	ENDDO
1221	RETURN
1222	ENDIF
1223
1224	!
1225	!-- Transpose array
1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1228	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1229	work(1), sendrecvcount_zyd, MPI_REAL, &
1230	comm1dy, ierr )
1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1232
1233	!
1234	!-- Reorder transposed array
1235	m = 0
1236	DO l = 0, pdims(2) - 1
1237	ys = 0 + l * nny
1238	DO k = nzb_yd, nzt_yd
1239	DO i = nxl_yd, nxr_yd
1240	DO j = ys, ys + nny - 1
1241	m = m + 1
1242	f_out(j,i,k) = work(m)
1243	ENDDO
1244	ENDDO
1245	ENDDO
1246	ENDDO
1247
1248	#endif
1249
1250	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |