Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4336

Last change on this file since 4336 was 4236, checked in by knoop, 5 years ago
Added missing OpenMP directives within "transpose" and "tridia_solver"
Property svn:keywords set to `Id`
File size: 32.4 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4236 2019-09-25 11:26:18Z raasch $
27	! Added missing OpenMP directives
28	!
29	! 4182 2019-08-22 15:20:23Z scharf
30	! Corrected "Former revisions" section
31	!
32	! 4171 2019-08-19 17:44:09Z gronemeier
33	! loop reordering for performance optimization
34	!
35	! 3832 2019-03-28 13:16:58Z raasch
36	! loop reordering for performance optimization
37	!
38	! 3694 2019-01-23 17:01:49Z knoop
39	! OpenACC port for SPEC
40	!
41	! Revision 1.1 1997/07/24 11:25:18 raasch
42	! Initial revision
43	!
44	!
45	! Description:
46	! ------------
47	!> Resorting data for the transposition from x to y. The transposition itself
48	!> is carried out in transpose_xy
49	!------------------------------------------------------------------------------!
50
51	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
52
53	SUBROUTINE resort_for_xy( f_in, f_inv )
54
55
56	USE indices, &
57	ONLY: nx
58
59	USE kinds
60
61	USE transpose_indices, &
62	ONLY: nyn_x, nys_x, nzb_x, nzt_x
63
64	IMPLICIT NONE
65
66	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
67	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
68
69
70	INTEGER(iwp) :: i !<
71	INTEGER(iwp) :: j !<
72	INTEGER(iwp) :: k !<
73	!
74	!-- Rearrange indices of input array in order to make data to be send
75	!-- by MPI contiguous
76	!$OMP PARALLEL PRIVATE ( i, j, k )
77	!$OMP DO
78	#if __acc_fft_device
79	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
80	!$ACC PRESENT(f_inv, f_in)
81	#endif
82	DO k = nzb_x, nzt_x
83	DO j = nys_x, nyn_x
84	DO i = 0, nx
85	f_inv(j,k,i) = f_in(i,j,k)
86	ENDDO
87	ENDDO
88	ENDDO
89	!$OMP END PARALLEL
90
91	END SUBROUTINE resort_for_xy
92
93
94	!------------------------------------------------------------------------------!
95	! Description:
96	! ------------
97	!> Transposition of input array (f_in) from x to y. For the input array, all
98	!> elements along x reside on the same PE, while after transposition, all
99	!> elements along y reside on the same PE.
100	!------------------------------------------------------------------------------!
101	SUBROUTINE transpose_xy( f_inv, f_out )
102
103
104	USE cpulog, &
105	ONLY: cpu_log, cpu_log_nowait, log_point_s
106
107	USE indices, &
108	ONLY: nx, ny
109
110	USE kinds
111
112	USE pegrid
113
114	USE transpose_indices, &
115	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
116
117	IMPLICIT NONE
118
119	INTEGER(iwp) :: i !<
120	INTEGER(iwp) :: j !<
121	INTEGER(iwp) :: k !<
122	INTEGER(iwp) :: l !<
123	INTEGER(iwp) :: ys !<
124
125	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
126	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
127
128	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
129	#if __acc_fft_device
130	!$ACC DECLARE CREATE(work)
131	#endif
132
133
134	IF ( numprocs /= 1 ) THEN
135
136	#if defined( __parallel )
137	!
138	!-- Transpose array
139	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
140
141	#if __acc_fft_device
142	#ifndef __cuda_aware_mpi
143	!$ACC UPDATE HOST(f_inv)
144	#else
145	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
146	#endif
147	#endif
148
149	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
150	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
151	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
152	comm1dy, ierr )
153
154	#if __acc_fft_device
155	#ifndef __cuda_aware_mpi
156	!$ACC UPDATE DEVICE(work)
157	#else
158	!$ACC END HOST_DATA
159	#endif
160	#endif
161
162	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
163
164	!
165	!-- Reorder transposed array
166	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
167	DO l = 0, pdims(2) - 1
168	ys = 0 + l * ( nyn_x - nys_x + 1 )
169	#if __acc_fft_device
170	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
171	!$ACC PRESENT(f_out, work)
172	#endif
173	!$OMP DO
174	DO i = nxl_y, nxr_y
175	DO k = nzb_y, nzt_y
176	DO j = ys, ys + nyn_x - nys_x
177	f_out(j,i,k) = work(j-ys+1,k,i,l)
178	ENDDO
179	ENDDO
180	ENDDO
181	!$OMP END DO NOWAIT
182	ENDDO
183	!$OMP END PARALLEL
184	#endif
185
186	ELSE
187
188	!
189	!-- Reorder transposed array
190	!$OMP PARALLEL PRIVATE ( i, j, k )
191	!$OMP DO
192	#if __acc_fft_device
193	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
194	!$ACC PRESENT(f_out, f_inv)
195	#endif
196	DO k = nzb_y, nzt_y
197	DO i = nxl_y, nxr_y
198	DO j = 0, ny
199	f_out(j,i,k) = f_inv(j,k,i)
200	ENDDO
201	ENDDO
202	ENDDO
203	!$OMP END PARALLEL
204
205	ENDIF
206
207	END SUBROUTINE transpose_xy
208
209
210	!------------------------------------------------------------------------------!
211	! Description:
212	! ------------
213	!> Resorting data after the transposition from x to z. The transposition itself
214	!> is carried out in transpose_xz
215	!------------------------------------------------------------------------------!
216	SUBROUTINE resort_for_xz( f_inv, f_out )
217
218
219	USE indices, &
220	ONLY: nxl, nxr, nyn, nys, nz
221
222	USE kinds
223
224	IMPLICIT NONE
225
226	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
227	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
228
229	INTEGER(iwp) :: i !<
230	INTEGER(iwp) :: j !<
231	INTEGER(iwp) :: k !<
232	!
233	!-- Rearrange indices of input array in order to make data to be send
234	!-- by MPI contiguous.
235	!-- In case of parallel fft/transposition, scattered store is faster in
236	!-- backward direction!!!
237	!$OMP PARALLEL PRIVATE ( i, j, k )
238	!$OMP DO
239	#if __acc_fft_device
240	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
241	!$ACC PRESENT(f_out, f_inv)
242	#endif
243	DO i = nxl, nxr
244	DO j = nys, nyn
245	DO k = 1, nz
246	f_out(k,j,i) = f_inv(j,i,k)
247	ENDDO
248	ENDDO
249	ENDDO
250	!$OMP END PARALLEL
251
252	END SUBROUTINE resort_for_xz
253
254
255	!------------------------------------------------------------------------------!
256	! Description:
257	! ------------
258	!> Transposition of input array (f_in) from x to z. For the input array, all
259	!> elements along x reside on the same PE, while after transposition, all
260	!> elements along z reside on the same PE.
261	!------------------------------------------------------------------------------!
262	SUBROUTINE transpose_xz( f_in, f_inv )
263
264
265	USE cpulog, &
266	ONLY: cpu_log, cpu_log_nowait, log_point_s
267
268	USE indices, &
269	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
270
271	USE kinds
272
273	USE pegrid
274
275	USE transpose_indices, &
276	ONLY: nyn_x, nys_x, nzb_x, nzt_x
277
278	IMPLICIT NONE
279
280	INTEGER(iwp) :: i !<
281	INTEGER(iwp) :: j !<
282	INTEGER(iwp) :: k !<
283	INTEGER(iwp) :: l !<
284	INTEGER(iwp) :: xs !<
285
286	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
287	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
288
289	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
290	#if __acc_fft_device
291	!$ACC DECLARE CREATE(work)
292	#endif
293
294
295	!
296	!-- If the PE grid is one-dimensional along y, the array has only to be
297	!-- reordered locally and therefore no transposition has to be done.
298	IF ( pdims(1) /= 1 ) THEN
299
300	#if defined( __parallel )
301	!
302	!-- Reorder input array for transposition
303	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
304	DO l = 0, pdims(1) - 1
305	xs = 0 + l * nnx
306	#if __acc_fft_device
307	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
308	!$ACC PRESENT(work, f_in)
309	#endif
310	!$OMP DO
311	DO k = nzb_x, nzt_x
312	DO i = xs, xs + nnx - 1
313	DO j = nys_x, nyn_x
314	work(j,i-xs+1,k,l) = f_in(i,j,k)
315	ENDDO
316	ENDDO
317	ENDDO
318	!$OMP END DO NOWAIT
319	ENDDO
320	!$OMP END PARALLEL
321
322	!
323	!-- Transpose array
324	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
325
326	#if __acc_fft_device
327	#ifndef __cuda_aware_mpi
328	!$ACC UPDATE HOST(work)
329	#else
330	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
331	#endif
332	#endif
333
334	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
335	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
336	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
337	comm1dx, ierr )
338
339	#if __acc_fft_device
340	#ifndef __cuda_aware_mpi
341	!$ACC UPDATE DEVICE(f_inv)
342	#else
343	!$ACC END HOST_DATA
344	#endif
345	#endif
346
347	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
348	#endif
349
350	ELSE
351
352	!
353	!-- Reorder the array in a way that the z index is in first position
354	!$OMP PARALLEL PRIVATE ( i, j, k )
355	!$OMP DO
356	#if __acc_fft_device
357	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
358	!$ACC PRESENT(f_inv, f_in)
359	#endif
360	DO i = nxl, nxr
361	DO j = nys, nyn
362	DO k = 1, nz
363	f_inv(j,i,k) = f_in(i,j,k)
364	ENDDO
365	ENDDO
366	ENDDO
367	!$OMP END PARALLEL
368
369	ENDIF
370
371	END SUBROUTINE transpose_xz
372
373
374	!------------------------------------------------------------------------------!
375	! Description:
376	! ------------
377	!> Resorting data after the transposition from y to x. The transposition itself
378	!> is carried out in transpose_yx
379	!------------------------------------------------------------------------------!
380	SUBROUTINE resort_for_yx( f_inv, f_out )
381
382
383	USE indices, &
384	ONLY: nx
385
386	USE kinds
387
388	USE transpose_indices, &
389	ONLY: nyn_x, nys_x, nzb_x, nzt_x
390
391	IMPLICIT NONE
392
393	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
394	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
395
396
397	INTEGER(iwp) :: i !<
398	INTEGER(iwp) :: j !<
399	INTEGER(iwp) :: k !<
400	!
401	!-- Rearrange indices of input array in order to make data to be send
402	!-- by MPI contiguous
403	!$OMP PARALLEL PRIVATE ( i, j, k )
404	!$OMP DO
405	#if __acc_fft_device
406	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
407	!$ACC PRESENT(f_out, f_inv)
408	#endif
409	DO k = nzb_x, nzt_x
410	DO j = nys_x, nyn_x
411	DO i = 0, nx
412	f_out(i,j,k) = f_inv(j,k,i)
413	ENDDO
414	ENDDO
415	ENDDO
416	!$OMP END PARALLEL
417
418	END SUBROUTINE resort_for_yx
419
420
421	!------------------------------------------------------------------------------!
422	! Description:
423	! ------------
424	!> Transposition of input array (f_in) from y to x. For the input array, all
425	!> elements along y reside on the same PE, while after transposition, all
426	!> elements along x reside on the same PE.
427	!------------------------------------------------------------------------------!
428	SUBROUTINE transpose_yx( f_in, f_inv )
429
430
431	USE cpulog, &
432	ONLY: cpu_log, cpu_log_nowait, log_point_s
433
434	USE indices, &
435	ONLY: nx, ny
436
437	USE kinds
438
439	USE pegrid
440
441	USE transpose_indices, &
442	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
443
444	IMPLICIT NONE
445
446	INTEGER(iwp) :: i !<
447	INTEGER(iwp) :: j !<
448	INTEGER(iwp) :: k !<
449	INTEGER(iwp) :: l !<
450	INTEGER(iwp) :: ys !<
451
452	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
453	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
454
455	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
456	#if __acc_fft_device
457	!$ACC DECLARE CREATE(work)
458	#endif
459
460
461	IF ( numprocs /= 1 ) THEN
462
463	#if defined( __parallel )
464	!
465	!-- Reorder input array for transposition
466	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
467	DO l = 0, pdims(2) - 1
468	ys = 0 + l * ( nyn_x - nys_x + 1 )
469	#if __acc_fft_device
470	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
471	!$ACC PRESENT(work, f_in)
472	#endif
473	!$OMP DO
474	DO i = nxl_y, nxr_y
475	DO k = nzb_y, nzt_y
476	DO j = ys, ys + nyn_x - nys_x
477	work(j-ys+1,k,i,l) = f_in(j,i,k)
478	ENDDO
479	ENDDO
480	ENDDO
481	!$OMP END DO NOWAIT
482	ENDDO
483	!$OMP END PARALLEL
484
485	!
486	!-- Transpose array
487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
488
489	#if __acc_fft_device
490	#ifndef __cuda_aware_mpi
491	!$ACC UPDATE HOST(work)
492	#else
493	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
494	#endif
495	#endif
496
497	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
498	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
499	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
500	comm1dy, ierr )
501
502	#if __acc_fft_device
503	#ifndef __cuda_aware_mpi
504	!$ACC UPDATE DEVICE(f_inv)
505	#else
506	!$ACC END HOST_DATA
507	#endif
508	#endif
509
510	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
511	#endif
512
513	ELSE
514
515	!
516	!-- Reorder array f_in the same way as ALLTOALL did it
517	!$OMP PARALLEL PRIVATE ( i, j, k )
518	!$OMP DO
519	#if __acc_fft_device
520	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
521	!$ACC PRESENT(f_inv, f_in)
522	#endif
523	DO i = nxl_y, nxr_y
524	DO k = nzb_y, nzt_y
525	DO j = 0, ny
526	f_inv(j,k,i) = f_in(j,i,k)
527	ENDDO
528	ENDDO
529	ENDDO
530	!$OMP END PARALLEL
531
532	ENDIF
533
534	END SUBROUTINE transpose_yx
535
536
537	!------------------------------------------------------------------------------!
538	! Description:
539	! ------------
540	!> Transposition of input array (f_in) from y to x. For the input array, all
541	!> elements along y reside on the same PE, while after transposition, all
542	!> elements along x reside on the same PE.
543	!> This is a direct transposition for arrays with indices in regular order
544	!> (k,j,i) (cf. transpose_yx).
545	!------------------------------------------------------------------------------!
546	SUBROUTINE transpose_yxd( f_in, f_out )
547
548
549	USE cpulog, &
550	ONLY: cpu_log, log_point_s
551
552	USE indices, &
553	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
554
555	USE kinds
556
557	USE pegrid
558
559	USE transpose_indices, &
560	ONLY: nyn_x, nys_x, nzb_x, nzt_x
561
562	IMPLICIT NONE
563
564	INTEGER(iwp) :: i !<
565	INTEGER(iwp) :: j !<
566	INTEGER(iwp) :: k !<
567	INTEGER(iwp) :: l !<
568	INTEGER(iwp) :: m !<
569	INTEGER(iwp) :: xs !<
570
571	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
572	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
573	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
574	REAL(wp) :: work(nnxnnynnz) !<
575	#if defined( __parallel )
576
577	!
578	!-- Rearrange indices of input array in order to make data to be send
579	!-- by MPI contiguous
580	DO k = 1, nz
581	DO j = nys, nyn
582	DO i = nxl, nxr
583	f_inv(i,k,j) = f_in(k,j,i)
584	ENDDO
585	ENDDO
586	ENDDO
587
588	!
589	!-- Transpose array
590	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
591	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
592	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
593	work(1), sendrecvcount_xy, MPI_REAL, &
594	comm1dx, ierr )
595	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
596
597	!
598	!-- Reorder transposed array
599	m = 0
600	DO l = 0, pdims(1) - 1
601	xs = 0 + l * nnx
602	DO j = nys_x, nyn_x
603	DO k = 1, nz
604	DO i = xs, xs + nnx - 1
605	m = m + 1
606	f_out(i,j,k) = work(m)
607	ENDDO
608	ENDDO
609	ENDDO
610	ENDDO
611
612	#endif
613
614	END SUBROUTINE transpose_yxd
615
616
617	!------------------------------------------------------------------------------!
618	! Description:
619	! ------------
620	!> Resorting data for the transposition from y to z. The transposition itself
621	!> is carried out in transpose_yz
622	!------------------------------------------------------------------------------!
623	SUBROUTINE resort_for_yz( f_in, f_inv )
624
625
626	USE indices, &
627	ONLY: ny
628
629	USE kinds
630
631	USE transpose_indices, &
632	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
633
634	IMPLICIT NONE
635
636	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
637	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
638
639	INTEGER(iwp) :: i !<
640	INTEGER(iwp) :: j !<
641	INTEGER(iwp) :: k !<
642
643	!
644	!-- Rearrange indices of input array in order to make data to be send
645	!-- by MPI contiguous
646	!$OMP PARALLEL PRIVATE ( i, j, k )
647	!$OMP DO
648	#if __acc_fft_device
649	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
650	!$ACC PRESENT(f_inv, f_in)
651	#endif
652	DO k = nzb_y, nzt_y
653	DO i = nxl_y, nxr_y
654	DO j = 0, ny
655	f_inv(i,k,j) = f_in(j,i,k)
656	ENDDO
657	ENDDO
658	ENDDO
659	!$OMP END PARALLEL
660
661	END SUBROUTINE resort_for_yz
662
663
664	!------------------------------------------------------------------------------!
665	! Description:
666	! ------------
667	!> Transposition of input array (f_in) from y to z. For the input array, all
668	!> elements along y reside on the same PE, while after transposition, all
669	!> elements along z reside on the same PE.
670	!------------------------------------------------------------------------------!
671	SUBROUTINE transpose_yz( f_inv, f_out )
672
673
674	USE cpulog, &
675	ONLY: cpu_log, cpu_log_nowait, log_point_s
676
677	USE indices, &
678	ONLY: ny, nz
679
680	USE kinds
681
682	USE pegrid
683
684	USE transpose_indices, &
685	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
686
687	IMPLICIT NONE
688
689	INTEGER(iwp) :: i !<
690	INTEGER(iwp) :: j !<
691	INTEGER(iwp) :: k !<
692	INTEGER(iwp) :: l !<
693	INTEGER(iwp) :: zs !<
694
695	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
696	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
697
698	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
699	#if __acc_fft_device
700	!$ACC DECLARE CREATE(work)
701	#endif
702
703
704	!
705	!-- If the PE grid is one-dimensional along y, only local reordering
706	!-- of the data is necessary and no transposition has to be done.
707	IF ( pdims(1) == 1 ) THEN
708
709	!$OMP PARALLEL PRIVATE ( i, j, k )
710	!$OMP DO
711	#if __acc_fft_device
712	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
713	!$ACC PRESENT(f_out, f_inv)
714	#endif
715	DO j = 0, ny
716	DO k = nzb_y, nzt_y
717	DO i = nxl_y, nxr_y
718	f_out(i,j,k) = f_inv(i,k,j)
719	ENDDO
720	ENDDO
721	ENDDO
722	!$OMP END PARALLEL
723
724	ELSE
725
726	#if defined( __parallel )
727	!
728	!-- Transpose array
729	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
730
731	#if __acc_fft_device
732	#ifndef __cuda_aware_mpi
733	!$ACC UPDATE HOST(f_inv)
734	#else
735	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
736	#endif
737	#endif
738
739	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
740	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
741	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
742	comm1dx, ierr )
743
744	#if __acc_fft_device
745	#ifndef __cuda_aware_mpi
746	!$ACC UPDATE DEVICE(work)
747	#else
748	!$ACC END HOST_DATA
749	#endif
750	#endif
751
752	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
753
754	!
755	!-- Reorder transposed array
756	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
757	DO l = 0, pdims(1) - 1
758	zs = 1 + l * ( nzt_y - nzb_y + 1 )
759	#if __acc_fft_device
760	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
761	!$ACC PRESENT(f_out, work)
762	#endif
763	!$OMP DO
764	DO j = nys_z, nyn_z
765	DO k = zs, zs + nzt_y - nzb_y
766	DO i = nxl_z, nxr_z
767	f_out(i,j,k) = work(i,k-zs+1,j,l)
768	ENDDO
769	ENDDO
770	ENDDO
771	!$OMP END DO NOWAIT
772	ENDDO
773	!$OMP END PARALLEL
774	#endif
775
776	ENDIF
777
778	END SUBROUTINE transpose_yz
779
780
781	!------------------------------------------------------------------------------!
782	! Description:
783	! ------------
784	!> Resorting data for the transposition from z to x. The transposition itself
785	!> is carried out in transpose_zx
786	!------------------------------------------------------------------------------!
787	SUBROUTINE resort_for_zx( f_in, f_inv )
788
789
790	USE indices, &
791	ONLY: nxl, nxr, nyn, nys, nz
792
793	USE kinds
794
795	IMPLICIT NONE
796
797	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
798	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
799
800	INTEGER(iwp) :: i !<
801	INTEGER(iwp) :: j !<
802	INTEGER(iwp) :: k !<
803
804	!
805	!-- Rearrange indices of input array in order to make data to be send
806	!-- by MPI contiguous
807	!$OMP PARALLEL PRIVATE ( i, j, k )
808	!$OMP DO
809	#if __acc_fft_device
810	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
811	!$ACC PRESENT(f_in, f_inv)
812	#endif
813	DO i = nxl, nxr
814	DO j = nys, nyn
815	DO k = 1,nz
816	f_inv(j,i,k) = f_in(k,j,i)
817	ENDDO
818	ENDDO
819	ENDDO
820	!$OMP END PARALLEL
821
822	END SUBROUTINE resort_for_zx
823
824
825	!------------------------------------------------------------------------------!
826	! Description:
827	! ------------
828	!> Transposition of input array (f_in) from z to x. For the input array, all
829	!> elements along z reside on the same PE, while after transposition, all
830	!> elements along x reside on the same PE.
831	!------------------------------------------------------------------------------!
832	SUBROUTINE transpose_zx( f_inv, f_out )
833
834
835	USE cpulog, &
836	ONLY: cpu_log, cpu_log_nowait, log_point_s
837
838	USE indices, &
839	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
840
841	USE kinds
842
843	USE pegrid
844
845	USE transpose_indices, &
846	ONLY: nyn_x, nys_x, nzb_x, nzt_x
847
848	IMPLICIT NONE
849
850	INTEGER(iwp) :: i !<
851	INTEGER(iwp) :: j !<
852	INTEGER(iwp) :: k !<
853	INTEGER(iwp) :: l !<
854	INTEGER(iwp) :: xs !<
855
856	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
857	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
858
859	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
860	#if __acc_fft_device
861	!$ACC DECLARE CREATE(work)
862	#endif
863
864
865	!
866	!-- If the PE grid is one-dimensional along y, only local reordering
867	!-- of the data is necessary and no transposition has to be done.
868	IF ( pdims(1) == 1 ) THEN
869
870	!$OMP PARALLEL PRIVATE ( i, j, k )
871	!$OMP DO
872	#if __acc_fft_device
873	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
874	!$ACC PRESENT(f_out, f_inv)
875	#endif
876	DO k = 1, nz
877	DO i = nxl, nxr
878	DO j = nys, nyn
879	f_out(i,j,k) = f_inv(j,i,k)
880	ENDDO
881	ENDDO
882	ENDDO
883	!$OMP END PARALLEL
884
885	ELSE
886
887	#if defined( __parallel )
888	!
889	!-- Transpose array
890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
891
892	#if __acc_fft_device
893	#ifndef __cuda_aware_mpi
894	!$ACC UPDATE HOST(f_inv)
895	#else
896	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
897	#endif
898	#endif
899
900	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
901	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
902	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
903	comm1dx, ierr )
904
905	#if __acc_fft_device
906	#ifndef __cuda_aware_mpi
907	!$ACC UPDATE DEVICE(work)
908	#else
909	!$ACC END HOST_DATA
910	#endif
911	#endif
912
913	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
914
915	!
916	!-- Reorder transposed array
917	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
918	DO l = 0, pdims(1) - 1
919	xs = 0 + l * nnx
920	#if __acc_fft_device
921	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
922	!$ACC PRESENT(f_out, work)
923	#endif
924	!$OMP DO
925	DO k = nzb_x, nzt_x
926	DO i = xs, xs + nnx - 1
927	DO j = nys_x, nyn_x
928	f_out(i,j,k) = work(j,i-xs+1,k,l)
929	ENDDO
930	ENDDO
931	ENDDO
932	!$OMP END DO NOWAIT
933	ENDDO
934	!$OMP END PARALLEL
935	#endif
936
937	ENDIF
938
939	END SUBROUTINE transpose_zx
940
941
942	!------------------------------------------------------------------------------!
943	! Description:
944	! ------------
945	!> Resorting data after the transposition from z to y. The transposition itself
946	!> is carried out in transpose_zy
947	!------------------------------------------------------------------------------!
948	SUBROUTINE resort_for_zy( f_inv, f_out )
949
950
951	USE indices, &
952	ONLY: ny
953
954	USE kinds
955
956	USE transpose_indices, &
957	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
958
959	IMPLICIT NONE
960
961	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
962	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
963
964
965	INTEGER(iwp) :: i !<
966	INTEGER(iwp) :: j !<
967	INTEGER(iwp) :: k !<
968
969	!
970	!-- Rearrange indices of input array in order to make data to be send
971	!-- by MPI contiguous
972	!$OMP PARALLEL PRIVATE ( i, j, k )
973	!$OMP DO
974	#if __acc_fft_device
975	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
976	!$ACC PRESENT(f_out, f_inv)
977	#endif
978	DO k = nzb_y, nzt_y
979	DO i = nxl_y, nxr_y
980	DO j = 0, ny
981	f_out(j,i,k) = f_inv(i,k,j)
982	ENDDO
983	ENDDO
984	ENDDO
985	!$OMP END PARALLEL
986
987	END SUBROUTINE resort_for_zy
988
989
990	!------------------------------------------------------------------------------!
991	! Description:cpu_log_nowait
992	! ------------
993	!> Transposition of input array (f_in) from z to y. For the input array, all
994	!> elements along z reside on the same PE, while after transposition, all
995	!> elements along y reside on the same PE.
996	!------------------------------------------------------------------------------!
997	SUBROUTINE transpose_zy( f_in, f_inv )
998
999
1000	USE cpulog, &
1001	ONLY: cpu_log, cpu_log_nowait, log_point_s
1002
1003	USE indices, &
1004	ONLY: ny, nz
1005
1006	USE kinds
1007
1008	USE pegrid
1009
1010	USE transpose_indices, &
1011	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1012
1013	IMPLICIT NONE
1014
1015	INTEGER(iwp) :: i !<
1016	INTEGER(iwp) :: j !<
1017	INTEGER(iwp) :: k !<
1018	INTEGER(iwp) :: l !<
1019	INTEGER(iwp) :: zs !<
1020
1021	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1022	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1023
1024	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1025	#if __acc_fft_device
1026	!$ACC DECLARE CREATE(work)
1027	#endif
1028
1029	!
1030	!-- If the PE grid is one-dimensional along y, the array has only to be
1031	!-- reordered locally and therefore no transposition has to be done.
1032	IF ( pdims(1) /= 1 ) THEN
1033
1034	#if defined( __parallel )
1035	!
1036	!-- Reorder input array for transposition
1037	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1038	DO l = 0, pdims(1) - 1
1039	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1040	#if __acc_fft_device
1041	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1042	!$ACC PRESENT(work, f_in)
1043	#endif
1044	!$OMP DO
1045	DO j = nys_z, nyn_z
1046	DO k = zs, zs + nzt_y - nzb_y
1047	DO i = nxl_z, nxr_z
1048	work(i,k-zs+1,j,l) = f_in(i,j,k)
1049	ENDDO
1050	ENDDO
1051	ENDDO
1052	!$OMP END DO NOWAIT
1053	ENDDO
1054	!$OMP END PARALLEL
1055
1056	!
1057	!-- Transpose array
1058	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1059
1060	#if __acc_fft_device
1061	#ifndef __cuda_aware_mpi
1062	!$ACC UPDATE HOST(work)
1063	#else
1064	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1065	#endif
1066	#endif
1067
1068	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1069	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1070	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1071	comm1dx, ierr )
1072
1073	#if __acc_fft_device
1074	#ifndef __cuda_aware_mpi
1075	!$ACC UPDATE DEVICE(f_inv)
1076	#else
1077	!$ACC END HOST_DATA
1078	#endif
1079	#endif
1080
1081	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1082	#endif
1083
1084	ELSE
1085	!
1086	!-- Reorder the array in the same way like ALLTOALL did it
1087	!$OMP PARALLEL PRIVATE ( i, j, k )
1088	!$OMP DO
1089	#if __acc_fft_device
1090	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1091	!$ACC PRESENT(f_inv, f_in)
1092	#endif
1093	DO k = nzb_y, nzt_y
1094	DO j = 0, ny
1095	DO i = nxl_y, nxr_y
1096	f_inv(i,k,j) = f_in(i,j,k)
1097	ENDDO
1098	ENDDO
1099	ENDDO
1100	!$OMP END PARALLEL
1101
1102	ENDIF
1103
1104	END SUBROUTINE transpose_zy
1105
1106
1107	!------------------------------------------------------------------------------!
1108	! Description:
1109	! ------------
1110	!> Transposition of input array (f_in) from z to y. For the input array, all
1111	!> elements along z reside on the same PE, while after transposition, all
1112	!> elements along y reside on the same PE.
1113	!> This is a direct transposition for arrays with indices in regular order
1114	!> (k,j,i) (cf. transpose_zy).
1115	!------------------------------------------------------------------------------!
1116	SUBROUTINE transpose_zyd( f_in, f_out )
1117
1118
1119	USE cpulog, &
1120	ONLY: cpu_log, log_point_s
1121
1122	USE indices, &
1123	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1124
1125	USE kinds
1126
1127	USE pegrid
1128
1129	USE transpose_indices, &
1130	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1131
1132	IMPLICIT NONE
1133
1134	INTEGER(iwp) :: i !<
1135	INTEGER(iwp) :: j !<
1136	INTEGER(iwp) :: k !<
1137	INTEGER(iwp) :: l !<
1138	INTEGER(iwp) :: m !<
1139	INTEGER(iwp) :: ys !<
1140
1141	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1142	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1143	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1144	REAL(wp) :: work(nnxnnynnz) !<
1145
1146	#if defined( __parallel )
1147
1148	!
1149	!-- Rearrange indices of input array in order to make data to be send
1150	!-- by MPI contiguous
1151	DO i = nxl, nxr
1152	DO j = nys, nyn
1153	DO k = 1, nz
1154	f_inv(j,i,k) = f_in(k,j,i)
1155	ENDDO
1156	ENDDO
1157	ENDDO
1158
1159	!
1160	!-- Move data to different array, because memory location of work1 is
1161	!-- needed further below (work1 = work2).
1162	!-- If the PE grid is one-dimensional along x, only local reordering
1163	!-- of the data is necessary and no transposition has to be done.
1164	IF ( pdims(2) == 1 ) THEN
1165	DO k = 1, nz
1166	DO i = nxl, nxr
1167	DO j = nys, nyn
1168	f_out(j,i,k) = f_inv(j,i,k)
1169	ENDDO
1170	ENDDO
1171	ENDDO
1172	RETURN
1173	ENDIF
1174
1175	!
1176	!-- Transpose array
1177	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1178	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1179	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1180	work(1), sendrecvcount_zyd, MPI_REAL, &
1181	comm1dy, ierr )
1182	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1183
1184	!
1185	!-- Reorder transposed array
1186	m = 0
1187	DO l = 0, pdims(2) - 1
1188	ys = 0 + l * nny
1189	DO k = nzb_yd, nzt_yd
1190	DO i = nxl_yd, nxr_yd
1191	DO j = ys, ys + nny - 1
1192	m = m + 1
1193	f_out(j,i,k) = work(m)
1194	ENDDO
1195	ENDDO
1196	ENDDO
1197	ENDDO
1198
1199	#endif
1200
1201	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |