Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1682

Last change on this file since 1682 was 1682, checked in by knoop, 9 years ago
Code annotations made doxygen readable
Property svn:keywords set to `Id`
File size: 31.1 KB

Line
1	!> @file transpose.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2014 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	! Code annotations made doxygen readable
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: transpose.f90 1682 2015-10-07 23:56:08Z knoop $
26	!
27	! 1324 2014-03-21 09:13:16Z suehring
28	! Bugfix: ONLY statement for module pegrid removed
29	!
30	! 1320 2014-03-20 08:40:49Z raasch
31	! ONLY-attribute added to USE-statements,
32	! kind-parameters added to all INTEGER and REAL declaration statements,
33	! kinds are defined in new module kinds,
34	! old module precision_kind is removed,
35	! revision history before 2012 removed,
36	! comment fields (!:) to be used for variable explanations added to
37	! all variable declaration statements
38	!
39	! 1318 2014-03-17 13:35:16Z raasch
40	! cpu_log_nowait parameter added to cpu measurements of the transpositions
41	! required for solving the Poisson equation (poisfft),
42	! module interfaces removed
43	!
44	! 1257 2013-11-08 15:18:40Z raasch
45	! openacc loop and loop vector clauses removed
46	!
47	! 1216 2013-08-26 09:31:42Z raasch
48	! re-sorting of the transposed / to be transposed arrays moved to separate
49	! routines resort_for_...
50	!
51	! 1111 2013-03-08 23:54:10Z raasch
52	! openACC directives added,
53	! resorting data from/to work changed, work got 4 dimensions instead of 1
54	!
55	! 1106 2013-03-04 05:31:38Z raasch
56	! preprocessor lines rearranged so that routines can also be used in serial
57	! (non-parallel) mode
58	!
59	! 1092 2013-02-02 11:24:22Z raasch
60	! unused variables removed
61	!
62	! 1036 2012-10-22 13:43:42Z raasch
63	! code put under GPL (PALM 3.9)
64	!
65	! 1003 2012-09-14 14:35:53Z raasch
66	! indices nxa, nya, etc. replaced by nx, ny, etc.
67	!
68	! Revision 1.1 1997/07/24 11:25:18 raasch
69	! Initial revision
70	!
71	!------------------------------------------------------------------------------!
72	! Description:
73	! ------------
74	!> Resorting data for the transposition from x to y. The transposition itself
75	!> is carried out in transpose_xy
76	!------------------------------------------------------------------------------!
77	SUBROUTINE resort_for_xy( f_in, f_inv )
78
79
80	USE indices, &
81	ONLY: nx
82
83	USE kinds
84
85	USE transpose_indices, &
86	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
87
88	IMPLICIT NONE
89
90	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
91	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
92
93
94	INTEGER(iwp) :: i !<
95	INTEGER(iwp) :: j !<
96	INTEGER(iwp) :: k !<
97	!
98	!-- Rearrange indices of input array in order to make data to be send
99	!-- by MPI contiguous
100	!$OMP PARALLEL PRIVATE ( i, j, k )
101	!$OMP DO
102	!$acc kernels present( f_in, f_inv )
103	DO i = 0, nx
104	DO k = nzb_x, nzt_x
105	DO j = nys_x, nyn_x
106	f_inv(j,k,i) = f_in(i,j,k)
107	ENDDO
108	ENDDO
109	ENDDO
110	!$acc end kernels
111	!$OMP END PARALLEL
112
113	END SUBROUTINE resort_for_xy
114
115
116	!------------------------------------------------------------------------------!
117	! Description:
118	! ------------
119	!> Transposition of input array (f_in) from x to y. For the input array, all
120	!> elements along x reside on the same PE, while after transposition, all
121	!> elements along y reside on the same PE.
122	!------------------------------------------------------------------------------!
123	SUBROUTINE transpose_xy( f_inv, f_out )
124
125
126	USE cpulog, &
127	ONLY: cpu_log, cpu_log_nowait, log_point_s
128
129	USE indices, &
130	ONLY: nx, ny
131
132	USE kinds
133
134	USE pegrid
135
136	USE transpose_indices, &
137	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
138
139	IMPLICIT NONE
140
141	INTEGER(iwp) :: i !<
142	INTEGER(iwp) :: j !<
143	INTEGER(iwp) :: k !<
144	INTEGER(iwp) :: l !<
145	INTEGER(iwp) :: ys !<
146
147	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
148	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
149
150	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
151
152
153	IF ( numprocs /= 1 ) THEN
154
155	#if defined( __parallel )
156	!
157	!-- Transpose array
158	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
159	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
160	!$acc update host( f_inv )
161	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
162	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
163	comm1dy, ierr )
164	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
165
166	!
167	!-- Reorder transposed array
168	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
169	!$OMP DO
170	!$acc data copyin( work )
171	DO l = 0, pdims(2) - 1
172	ys = 0 + l * ( nyn_x - nys_x + 1 )
173	!$acc kernels present( f_out, work )
174	DO i = nxl_y, nxr_y
175	DO k = nzb_y, nzt_y
176	DO j = ys, ys + nyn_x - nys_x
177	f_out(j,i,k) = work(j-ys+1,k,i,l)
178	ENDDO
179	ENDDO
180	ENDDO
181	!$acc end kernels
182	ENDDO
183	!$acc end data
184	!$OMP END PARALLEL
185	#endif
186
187	ELSE
188
189	!
190	!-- Reorder transposed array
191	!$OMP PARALLEL PRIVATE ( i, j, k )
192	!$OMP DO
193	!$acc kernels present( f_inv, f_out )
194	DO k = nzb_y, nzt_y
195	DO i = nxl_y, nxr_y
196	DO j = 0, ny
197	f_out(j,i,k) = f_inv(j,k,i)
198	ENDDO
199	ENDDO
200	ENDDO
201	!$acc end kernels
202	!$OMP END PARALLEL
203
204	ENDIF
205
206	END SUBROUTINE transpose_xy
207
208
209	!------------------------------------------------------------------------------!
210	! Description:
211	! ------------
212	!> Resorting data after the transposition from x to z. The transposition itself
213	!> is carried out in transpose_xz
214	!------------------------------------------------------------------------------!
215	SUBROUTINE resort_for_xz( f_inv, f_out )
216
217
218	USE indices, &
219	ONLY: nxl, nxr, nyn, nys, nz
220
221	USE kinds
222
223	IMPLICIT NONE
224
225	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
226	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
227
228	INTEGER(iwp) :: i !<
229	INTEGER(iwp) :: j !<
230	INTEGER(iwp) :: k !<
231	!
232	!-- Rearrange indices of input array in order to make data to be send
233	!-- by MPI contiguous.
234	!-- In case of parallel fft/transposition, scattered store is faster in
235	!-- backward direction!!!
236	!$OMP PARALLEL PRIVATE ( i, j, k )
237	!$OMP DO
238	!$acc kernels present( f_inv, f_out )
239	DO k = 1, nz
240	DO i = nxl, nxr
241	DO j = nys, nyn
242	f_out(k,j,i) = f_inv(j,i,k)
243	ENDDO
244	ENDDO
245	ENDDO
246	!$acc end kernels
247	!$OMP END PARALLEL
248
249	END SUBROUTINE resort_for_xz
250
251
252	!------------------------------------------------------------------------------!
253	! Description:
254	! ------------
255	!> Transposition of input array (f_in) from x to z. For the input array, all
256	!> elements along x reside on the same PE, while after transposition, all
257	!> elements along z reside on the same PE.
258	!------------------------------------------------------------------------------!
259	SUBROUTINE transpose_xz( f_in, f_inv )
260
261
262	USE cpulog, &
263	ONLY: cpu_log, cpu_log_nowait, log_point_s
264
265	USE indices, &
266	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
267
268	USE kinds
269
270	USE pegrid
271
272	USE transpose_indices, &
273	ONLY: nyn_x, nys_x, nzb_x, nzt_x
274
275	IMPLICIT NONE
276
277	INTEGER(iwp) :: i !<
278	INTEGER(iwp) :: j !<
279	INTEGER(iwp) :: k !<
280	INTEGER(iwp) :: l !<
281	INTEGER(iwp) :: xs !<
282
283	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
284	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
285
286	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
287
288
289	!
290	!-- If the PE grid is one-dimensional along y, the array has only to be
291	!-- reordered locally and therefore no transposition has to be done.
292	IF ( pdims(1) /= 1 ) THEN
293
294	#if defined( __parallel )
295	!
296	!-- Reorder input array for transposition
297	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
298	!$OMP DO
299	!$acc data copyout( work )
300	DO l = 0, pdims(1) - 1
301	xs = 0 + l * nnx
302	!$acc kernels present( f_in, work )
303	DO k = nzb_x, nzt_x
304	DO i = xs, xs + nnx - 1
305	DO j = nys_x, nyn_x
306	work(j,i-xs+1,k,l) = f_in(i,j,k)
307	ENDDO
308	ENDDO
309	ENDDO
310	!$acc end kernels
311	ENDDO
312	!$acc end data
313	!$OMP END PARALLEL
314
315	!
316	!-- Transpose array
317	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
318	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
319	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
320	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
321	comm1dx, ierr )
322	!$acc update device( f_inv )
323	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
324	#endif
325
326	ELSE
327
328	!
329	!-- Reorder the array in a way that the z index is in first position
330	!$OMP PARALLEL PRIVATE ( i, j, k )
331	!$OMP DO
332	!$acc kernels present( f_in, f_inv )
333	DO i = nxl, nxr
334	DO j = nys, nyn
335	DO k = 1, nz
336	f_inv(j,i,k) = f_in(i,j,k)
337	ENDDO
338	ENDDO
339	ENDDO
340	!$acc end kernels
341	!$OMP END PARALLEL
342
343	ENDIF
344
345	END SUBROUTINE transpose_xz
346
347
348	!------------------------------------------------------------------------------!
349	! Description:
350	! ------------
351	!> Resorting data after the transposition from y to x. The transposition itself
352	!> is carried out in transpose_yx
353	!------------------------------------------------------------------------------!
354	SUBROUTINE resort_for_yx( f_inv, f_out )
355
356
357	USE indices, &
358	ONLY: nx
359
360	USE kinds
361
362	USE transpose_indices, &
363	ONLY: nyn_x, nys_x, nzb_x, nzt_x
364
365	IMPLICIT NONE
366
367	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
368	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
369
370
371	INTEGER(iwp) :: i !<
372	INTEGER(iwp) :: j !<
373	INTEGER(iwp) :: k !<
374	!
375	!-- Rearrange indices of input array in order to make data to be send
376	!-- by MPI contiguous
377	!$OMP PARALLEL PRIVATE ( i, j, k )
378	!$OMP DO
379	!$acc kernels present( f_inv, f_out )
380	DO i = 0, nx
381	DO k = nzb_x, nzt_x
382	DO j = nys_x, nyn_x
383	f_out(i,j,k) = f_inv(j,k,i)
384	ENDDO
385	ENDDO
386	ENDDO
387	!$acc end kernels
388	!$OMP END PARALLEL
389
390	END SUBROUTINE resort_for_yx
391
392
393	!------------------------------------------------------------------------------!
394	! Description:
395	! ------------
396	!> Transposition of input array (f_in) from y to x. For the input array, all
397	!> elements along y reside on the same PE, while after transposition, all
398	!> elements along x reside on the same PE.
399	!------------------------------------------------------------------------------!
400	SUBROUTINE transpose_yx( f_in, f_inv )
401
402
403	USE cpulog, &
404	ONLY: cpu_log, cpu_log_nowait, log_point_s
405
406	USE indices, &
407	ONLY: nx, ny
408
409	USE kinds
410
411	USE pegrid
412
413	USE transpose_indices, &
414	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
415
416	IMPLICIT NONE
417
418	INTEGER(iwp) :: i !<
419	INTEGER(iwp) :: j !<
420	INTEGER(iwp) :: k !<
421	INTEGER(iwp) :: l !<
422	INTEGER(iwp) :: ys !<
423
424	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
425	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
426
427	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
428
429
430	IF ( numprocs /= 1 ) THEN
431
432	#if defined( __parallel )
433	!
434	!-- Reorder input array for transposition
435	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
436	!$OMP DO
437	!$acc data copyout( work )
438	DO l = 0, pdims(2) - 1
439	ys = 0 + l * ( nyn_x - nys_x + 1 )
440	!$acc kernels present( f_in, work )
441	DO i = nxl_y, nxr_y
442	DO k = nzb_y, nzt_y
443	DO j = ys, ys + nyn_x - nys_x
444	work(j-ys+1,k,i,l) = f_in(j,i,k)
445	ENDDO
446	ENDDO
447	ENDDO
448	!$acc end kernels
449	ENDDO
450	!$acc end data
451	!$OMP END PARALLEL
452
453	!
454	!-- Transpose array
455	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
456	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
457	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
458	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
459	comm1dy, ierr )
460	!$acc update device( f_inv )
461	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
462	#endif
463
464	ELSE
465
466	!
467	!-- Reorder array f_in the same way as ALLTOALL did it
468	!$OMP PARALLEL PRIVATE ( i, j, k )
469	!$OMP DO
470	!$acc kernels present( f_in, f_inv )
471	DO i = nxl_y, nxr_y
472	DO k = nzb_y, nzt_y
473	DO j = 0, ny
474	f_inv(j,k,i) = f_in(j,i,k)
475	ENDDO
476	ENDDO
477	ENDDO
478	!$acc end kernels
479	!$OMP END PARALLEL
480
481	ENDIF
482
483	END SUBROUTINE transpose_yx
484
485
486	!------------------------------------------------------------------------------!
487	! Description:
488	! ------------
489	!> Transposition of input array (f_in) from y to x. For the input array, all
490	!> elements along y reside on the same PE, while after transposition, all
491	!> elements along x reside on the same PE.
492	!> This is a direct transposition for arrays with indices in regular order
493	!> (k,j,i) (cf. transpose_yx).
494	!------------------------------------------------------------------------------!
495	SUBROUTINE transpose_yxd( f_in, f_out )
496
497
498	USE cpulog, &
499	ONLY: cpu_log, cpu_log_nowait, log_point_s
500
501	USE indices, &
502	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
503
504	USE kinds
505
506	USE pegrid
507
508	USE transpose_indices, &
509	ONLY: nyn_x, nys_x, nzb_x, nzt_x
510
511	IMPLICIT NONE
512
513	INTEGER(iwp) :: i !<
514	INTEGER(iwp) :: j !<
515	INTEGER(iwp) :: k !<
516	INTEGER(iwp) :: l !<
517	INTEGER(iwp) :: m !<
518	INTEGER(iwp) :: xs !<
519
520	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
521	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
522	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
523	REAL(wp) :: work(nnxnnynnz) !<
524	#if defined( __parallel )
525
526	!
527	!-- Rearrange indices of input array in order to make data to be send
528	!-- by MPI contiguous
529	DO k = 1, nz
530	DO j = nys, nyn
531	DO i = nxl, nxr
532	f_inv(i,k,j) = f_in(k,j,i)
533	ENDDO
534	ENDDO
535	ENDDO
536
537	!
538	!-- Transpose array
539	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
540	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
541	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
542	work(1), sendrecvcount_xy, MPI_REAL, &
543	comm1dx, ierr )
544	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
545
546	!
547	!-- Reorder transposed array
548	m = 0
549	DO l = 0, pdims(1) - 1
550	xs = 0 + l * nnx
551	DO j = nys_x, nyn_x
552	DO k = 1, nz
553	DO i = xs, xs + nnx - 1
554	m = m + 1
555	f_out(i,j,k) = work(m)
556	ENDDO
557	ENDDO
558	ENDDO
559	ENDDO
560
561	#endif
562
563	END SUBROUTINE transpose_yxd
564
565
566	!------------------------------------------------------------------------------!
567	! Description:
568	! ------------
569	!> Resorting data for the transposition from y to z. The transposition itself
570	!> is carried out in transpose_yz
571	!------------------------------------------------------------------------------!
572	SUBROUTINE resort_for_yz( f_in, f_inv )
573
574
575	USE indices, &
576	ONLY: ny
577
578	USE kinds
579
580	USE transpose_indices, &
581	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
582
583	IMPLICIT NONE
584
585	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
586	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
587
588	INTEGER(iwp) :: i !<
589	INTEGER(iwp) :: j !<
590	INTEGER(iwp) :: k !<
591
592	!
593	!-- Rearrange indices of input array in order to make data to be send
594	!-- by MPI contiguous
595	!$OMP PARALLEL PRIVATE ( i, j, k )
596	!$OMP DO
597	!$acc kernels present( f_in, f_inv )
598	DO j = 0, ny
599	DO k = nzb_y, nzt_y
600	DO i = nxl_y, nxr_y
601	f_inv(i,k,j) = f_in(j,i,k)
602	ENDDO
603	ENDDO
604	ENDDO
605	!$acc end kernels
606	!$OMP END PARALLEL
607
608	END SUBROUTINE resort_for_yz
609
610
611	!------------------------------------------------------------------------------!
612	! Description:
613	! ------------
614	!> Transposition of input array (f_in) from y to z. For the input array, all
615	!> elements along y reside on the same PE, while after transposition, all
616	!> elements along z reside on the same PE.
617	!------------------------------------------------------------------------------!
618	SUBROUTINE transpose_yz( f_inv, f_out )
619
620
621	USE cpulog, &
622	ONLY: cpu_log, cpu_log_nowait, log_point_s
623
624	USE indices, &
625	ONLY: ny, nz
626
627	USE kinds
628
629	USE pegrid
630
631	USE transpose_indices, &
632	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
633
634	IMPLICIT NONE
635
636	INTEGER(iwp) :: i !<
637	INTEGER(iwp) :: j !<
638	INTEGER(iwp) :: k !<
639	INTEGER(iwp) :: l !<
640	INTEGER(iwp) :: zs !<
641
642	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
643	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
644
645	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
646
647
648	!
649	!-- If the PE grid is one-dimensional along y, only local reordering
650	!-- of the data is necessary and no transposition has to be done.
651	IF ( pdims(1) == 1 ) THEN
652
653	!$OMP PARALLEL PRIVATE ( i, j, k )
654	!$OMP DO
655	!$acc kernels present( f_inv, f_out )
656	DO j = 0, ny
657	DO k = nzb_y, nzt_y
658	DO i = nxl_y, nxr_y
659	f_out(i,j,k) = f_inv(i,k,j)
660	ENDDO
661	ENDDO
662	ENDDO
663	!$acc end kernels
664	!$OMP END PARALLEL
665
666	ELSE
667
668	#if defined( __parallel )
669	!
670	!-- Transpose array
671	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
672	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
673	!$acc update host( f_inv )
674	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
675	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
676	comm1dx, ierr )
677	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
678
679	!
680	!-- Reorder transposed array
681	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
682	!$OMP DO
683	!$acc data copyin( work )
684	DO l = 0, pdims(1) - 1
685	zs = 1 + l * ( nzt_y - nzb_y + 1 )
686	!$acc kernels present( f_out )
687	DO j = nys_z, nyn_z
688	DO k = zs, zs + nzt_y - nzb_y
689	DO i = nxl_z, nxr_z
690	f_out(i,j,k) = work(i,k-zs+1,j,l)
691	ENDDO
692	ENDDO
693	ENDDO
694	!$acc end kernels
695	ENDDO
696	!$acc end data
697	!$OMP END PARALLEL
698	#endif
699
700	ENDIF
701
702	END SUBROUTINE transpose_yz
703
704
705	!------------------------------------------------------------------------------!
706	! Description:
707	! ------------
708	!> Resorting data for the transposition from z to x. The transposition itself
709	!> is carried out in transpose_zx
710	!------------------------------------------------------------------------------!
711	SUBROUTINE resort_for_zx( f_in, f_inv )
712
713
714	USE indices, &
715	ONLY: nxl, nxr, nyn, nys, nz
716
717	USE kinds
718
719	IMPLICIT NONE
720
721	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
722	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
723
724	INTEGER(iwp) :: i !<
725	INTEGER(iwp) :: j !<
726	INTEGER(iwp) :: k !<
727
728	!
729	!-- Rearrange indices of input array in order to make data to be send
730	!-- by MPI contiguous
731	!$OMP PARALLEL PRIVATE ( i, j, k )
732	!$OMP DO
733	!$acc kernels present( f_in, f_inv )
734	DO k = 1,nz
735	DO i = nxl, nxr
736	DO j = nys, nyn
737	f_inv(j,i,k) = f_in(k,j,i)
738	ENDDO
739	ENDDO
740	ENDDO
741	!$acc end kernels
742	!$OMP END PARALLEL
743
744	END SUBROUTINE resort_for_zx
745
746
747	!------------------------------------------------------------------------------!
748	! Description:
749	! ------------
750	!> Transposition of input array (f_in) from z to x. For the input array, all
751	!> elements along z reside on the same PE, while after transposition, all
752	!> elements along x reside on the same PE.
753	!------------------------------------------------------------------------------!
754	SUBROUTINE transpose_zx( f_inv, f_out )
755
756
757	USE cpulog, &
758	ONLY: cpu_log, cpu_log_nowait, log_point_s
759
760	USE indices, &
761	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
762
763	USE kinds
764
765	USE pegrid
766
767	USE transpose_indices, &
768	ONLY: nyn_x, nys_x, nzb_x, nzt_x
769
770	IMPLICIT NONE
771
772	INTEGER(iwp) :: i !<
773	INTEGER(iwp) :: j !<
774	INTEGER(iwp) :: k !<
775	INTEGER(iwp) :: l !<
776	INTEGER(iwp) :: xs !<
777
778	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
779	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
780
781	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
782
783
784	!
785	!-- If the PE grid is one-dimensional along y, only local reordering
786	!-- of the data is necessary and no transposition has to be done.
787	IF ( pdims(1) == 1 ) THEN
788
789	!$OMP PARALLEL PRIVATE ( i, j, k )
790	!$OMP DO
791	!$acc kernels present( f_inv, f_out )
792	DO k = 1, nz
793	DO i = nxl, nxr
794	DO j = nys, nyn
795	f_out(i,j,k) = f_inv(j,i,k)
796	ENDDO
797	ENDDO
798	ENDDO
799	!$acc end kernels
800	!$OMP END PARALLEL
801
802	ELSE
803
804	#if defined( __parallel )
805	!
806	!-- Transpose array
807	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
808	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
809	!$acc update host( f_inv )
810	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
811	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
812	comm1dx, ierr )
813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
814
815	!
816	!-- Reorder transposed array
817	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
818	!$OMP DO
819	!$acc data copyin( work )
820	DO l = 0, pdims(1) - 1
821	xs = 0 + l * nnx
822	!$acc kernels present( f_out )
823	DO k = nzb_x, nzt_x
824	DO i = xs, xs + nnx - 1
825	DO j = nys_x, nyn_x
826	f_out(i,j,k) = work(j,i-xs+1,k,l)
827	ENDDO
828	ENDDO
829	ENDDO
830	!$acc end kernels
831	ENDDO
832	!$acc end data
833	!$OMP END PARALLEL
834	#endif
835
836	ENDIF
837
838	END SUBROUTINE transpose_zx
839
840
841	!------------------------------------------------------------------------------!
842	! Description:
843	! ------------
844	!> Resorting data after the transposition from z to y. The transposition itself
845	!> is carried out in transpose_zy
846	!------------------------------------------------------------------------------!
847	SUBROUTINE resort_for_zy( f_inv, f_out )
848
849
850	USE indices, &
851	ONLY: ny
852
853	USE kinds
854
855	USE transpose_indices, &
856	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
857
858	IMPLICIT NONE
859
860	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
861	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
862
863
864	INTEGER(iwp) :: i !<
865	INTEGER(iwp) :: j !<
866	INTEGER(iwp) :: k !<
867
868	!
869	!-- Rearrange indices of input array in order to make data to be send
870	!-- by MPI contiguous
871	!$OMP PARALLEL PRIVATE ( i, j, k )
872	!$OMP DO
873	!$acc kernels present( f_inv, f_out )
874	DO k = nzb_y, nzt_y
875	DO j = 0, ny
876	DO i = nxl_y, nxr_y
877	f_out(j,i,k) = f_inv(i,k,j)
878	ENDDO
879	ENDDO
880	ENDDO
881	!$acc end kernels
882	!$OMP END PARALLEL
883
884	END SUBROUTINE resort_for_zy
885
886
887	!------------------------------------------------------------------------------!
888	! Description:
889	! ------------
890	!> Transposition of input array (f_in) from z to y. For the input array, all
891	!> elements along z reside on the same PE, while after transposition, all
892	!> elements along y reside on the same PE.
893	!------------------------------------------------------------------------------!
894	SUBROUTINE transpose_zy( f_in, f_inv )
895
896
897	USE cpulog, &
898	ONLY: cpu_log, cpu_log_nowait, log_point_s
899
900	USE indices, &
901	ONLY: ny, nz
902
903	USE kinds
904
905	USE pegrid
906
907	USE transpose_indices, &
908	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
909
910	IMPLICIT NONE
911
912	INTEGER(iwp) :: i !<
913	INTEGER(iwp) :: j !<
914	INTEGER(iwp) :: k !<
915	INTEGER(iwp) :: l !<
916	INTEGER(iwp) :: zs !<
917
918	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
919	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
920
921	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
922
923	!
924	!-- If the PE grid is one-dimensional along y, the array has only to be
925	!-- reordered locally and therefore no transposition has to be done.
926	IF ( pdims(1) /= 1 ) THEN
927
928	#if defined( __parallel )
929	!
930	!-- Reorder input array for transposition
931	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
932	!$OMP DO
933	!$acc data copyout( work )
934	DO l = 0, pdims(1) - 1
935	zs = 1 + l * ( nzt_y - nzb_y + 1 )
936	!$acc kernels present( f_in, work )
937	DO j = nys_z, nyn_z
938	DO k = zs, zs + nzt_y - nzb_y
939	DO i = nxl_z, nxr_z
940	work(i,k-zs+1,j,l) = f_in(i,j,k)
941	ENDDO
942	ENDDO
943	ENDDO
944	!$acc end kernels
945	ENDDO
946	!$acc end data
947	!$OMP END PARALLEL
948
949	!
950	!-- Transpose array
951	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
952	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
953	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
954	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
955	comm1dx, ierr )
956	!$acc update device( f_inv )
957	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
958	#endif
959
960	ELSE
961	!
962	!-- Reorder the array in the same way like ALLTOALL did it
963	!$OMP PARALLEL PRIVATE ( i, j, k )
964	!$OMP DO
965	!$acc kernels present( f_in, f_inv )
966	DO k = nzb_y, nzt_y
967	DO j = 0, ny
968	DO i = nxl_y, nxr_y
969	f_inv(i,k,j) = f_in(i,j,k)
970	ENDDO
971	ENDDO
972	ENDDO
973	!$acc end kernels
974	!$OMP END PARALLEL
975
976	ENDIF
977
978	END SUBROUTINE transpose_zy
979
980
981	!------------------------------------------------------------------------------!
982	! Description:
983	! ------------
984	!> Transposition of input array (f_in) from z to y. For the input array, all
985	!> elements along z reside on the same PE, while after transposition, all
986	!> elements along y reside on the same PE.
987	!> This is a direct transposition for arrays with indices in regular order
988	!> (k,j,i) (cf. transpose_zy).
989	!------------------------------------------------------------------------------!
990	SUBROUTINE transpose_zyd( f_in, f_out )
991
992
993	USE cpulog, &
994	ONLY: cpu_log, cpu_log_nowait, log_point_s
995
996	USE indices, &
997	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
998
999	USE kinds
1000
1001	USE pegrid
1002
1003	USE transpose_indices, &
1004	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
1005
1006	IMPLICIT NONE
1007
1008	INTEGER(iwp) :: i !<
1009	INTEGER(iwp) :: j !<
1010	INTEGER(iwp) :: k !<
1011	INTEGER(iwp) :: l !<
1012	INTEGER(iwp) :: m !<
1013	INTEGER(iwp) :: ys !<
1014
1015	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1016	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1017	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1018	REAL(wp) :: work(nnxnnynnz) !<
1019
1020	#if defined( __parallel )
1021
1022	!
1023	!-- Rearrange indices of input array in order to make data to be send
1024	!-- by MPI contiguous
1025	DO i = nxl, nxr
1026	DO j = nys, nyn
1027	DO k = 1, nz
1028	f_inv(j,i,k) = f_in(k,j,i)
1029	ENDDO
1030	ENDDO
1031	ENDDO
1032
1033	!
1034	!-- Move data to different array, because memory location of work1 is
1035	!-- needed further below (work1 = work2).
1036	!-- If the PE grid is one-dimensional along x, only local reordering
1037	!-- of the data is necessary and no transposition has to be done.
1038	IF ( pdims(2) == 1 ) THEN
1039	DO k = 1, nz
1040	DO i = nxl, nxr
1041	DO j = nys, nyn
1042	f_out(j,i,k) = f_inv(j,i,k)
1043	ENDDO
1044	ENDDO
1045	ENDDO
1046	RETURN
1047	ENDIF
1048
1049	!
1050	!-- Transpose array
1051	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1052	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1053	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1054	work(1), sendrecvcount_zyd, MPI_REAL, &
1055	comm1dy, ierr )
1056	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1057
1058	!
1059	!-- Reorder transposed array
1060	m = 0
1061	DO l = 0, pdims(2) - 1
1062	ys = 0 + l * nny
1063	DO k = nzb_yd, nzt_yd
1064	DO i = nxl_yd, nxr_yd
1065	DO j = ys, ys + nny - 1
1066	m = m + 1
1067	f_out(j,i,k) = work(m)
1068	ENDDO
1069	ENDDO
1070	ENDDO
1071	ENDDO
1072
1073	#endif
1074
1075	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |