Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1927

Last change on this file since 1927 was 1818, checked in by maronga, 9 years ago
last commit documented / copyright update
Property svn:keywords set to `Id`
File size: 31.2 KB

Line
1	!> @file transpose.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2016 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	!
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: transpose.f90 1818 2016-04-06 15:53:27Z hellstea $
26	!
27	! 1682 2015-10-07 23:56:08Z knoop
28	! Code annotations made doxygen readable
29	!
30	! 1324 2014-03-21 09:13:16Z suehring
31	! Bugfix: ONLY statement for module pegrid removed
32	!
33	! 1320 2014-03-20 08:40:49Z raasch
34	! ONLY-attribute added to USE-statements,
35	! kind-parameters added to all INTEGER and REAL declaration statements,
36	! kinds are defined in new module kinds,
37	! old module precision_kind is removed,
38	! revision history before 2012 removed,
39	! comment fields (!:) to be used for variable explanations added to
40	! all variable declaration statements
41	!
42	! 1318 2014-03-17 13:35:16Z raasch
43	! cpu_log_nowait parameter added to cpu measurements of the transpositions
44	! required for solving the Poisson equation (poisfft),
45	! module interfaces removed
46	!
47	! 1257 2013-11-08 15:18:40Z raasch
48	! openacc loop and loop vector clauses removed
49	!
50	! 1216 2013-08-26 09:31:42Z raasch
51	! re-sorting of the transposed / to be transposed arrays moved to separate
52	! routines resort_for_...
53	!
54	! 1111 2013-03-08 23:54:10Z raasch
55	! openACC directives added,
56	! resorting data from/to work changed, work got 4 dimensions instead of 1
57	!
58	! 1106 2013-03-04 05:31:38Z raasch
59	! preprocessor lines rearranged so that routines can also be used in serial
60	! (non-parallel) mode
61	!
62	! 1092 2013-02-02 11:24:22Z raasch
63	! unused variables removed
64	!
65	! 1036 2012-10-22 13:43:42Z raasch
66	! code put under GPL (PALM 3.9)
67	!
68	! 1003 2012-09-14 14:35:53Z raasch
69	! indices nxa, nya, etc. replaced by nx, ny, etc.
70	!
71	! Revision 1.1 1997/07/24 11:25:18 raasch
72	! Initial revision
73	!
74	!------------------------------------------------------------------------------!
75	! Description:
76	! ------------
77	!> Resorting data for the transposition from x to y. The transposition itself
78	!> is carried out in transpose_xy
79	!------------------------------------------------------------------------------!
80	SUBROUTINE resort_for_xy( f_in, f_inv )
81
82
83	USE indices, &
84	ONLY: nx
85
86	USE kinds
87
88	USE transpose_indices, &
89	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
90
91	IMPLICIT NONE
92
93	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
94	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
95
96
97	INTEGER(iwp) :: i !<
98	INTEGER(iwp) :: j !<
99	INTEGER(iwp) :: k !<
100	!
101	!-- Rearrange indices of input array in order to make data to be send
102	!-- by MPI contiguous
103	!$OMP PARALLEL PRIVATE ( i, j, k )
104	!$OMP DO
105	!$acc kernels present( f_in, f_inv )
106	DO i = 0, nx
107	DO k = nzb_x, nzt_x
108	DO j = nys_x, nyn_x
109	f_inv(j,k,i) = f_in(i,j,k)
110	ENDDO
111	ENDDO
112	ENDDO
113	!$acc end kernels
114	!$OMP END PARALLEL
115
116	END SUBROUTINE resort_for_xy
117
118
119	!------------------------------------------------------------------------------!
120	! Description:
121	! ------------
122	!> Transposition of input array (f_in) from x to y. For the input array, all
123	!> elements along x reside on the same PE, while after transposition, all
124	!> elements along y reside on the same PE.
125	!------------------------------------------------------------------------------!
126	SUBROUTINE transpose_xy( f_inv, f_out )
127
128
129	USE cpulog, &
130	ONLY: cpu_log, cpu_log_nowait, log_point_s
131
132	USE indices, &
133	ONLY: nx, ny
134
135	USE kinds
136
137	USE pegrid
138
139	USE transpose_indices, &
140	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
141
142	IMPLICIT NONE
143
144	INTEGER(iwp) :: i !<
145	INTEGER(iwp) :: j !<
146	INTEGER(iwp) :: k !<
147	INTEGER(iwp) :: l !<
148	INTEGER(iwp) :: ys !<
149
150	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
151	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
152
153	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
154
155
156	IF ( numprocs /= 1 ) THEN
157
158	#if defined( __parallel )
159	!
160	!-- Transpose array
161	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
162	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
163	!$acc update host( f_inv )
164	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
165	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
166	comm1dy, ierr )
167	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
168
169	!
170	!-- Reorder transposed array
171	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
172	!$OMP DO
173	!$acc data copyin( work )
174	DO l = 0, pdims(2) - 1
175	ys = 0 + l * ( nyn_x - nys_x + 1 )
176	!$acc kernels present( f_out, work )
177	DO i = nxl_y, nxr_y
178	DO k = nzb_y, nzt_y
179	DO j = ys, ys + nyn_x - nys_x
180	f_out(j,i,k) = work(j-ys+1,k,i,l)
181	ENDDO
182	ENDDO
183	ENDDO
184	!$acc end kernels
185	ENDDO
186	!$acc end data
187	!$OMP END PARALLEL
188	#endif
189
190	ELSE
191
192	!
193	!-- Reorder transposed array
194	!$OMP PARALLEL PRIVATE ( i, j, k )
195	!$OMP DO
196	!$acc kernels present( f_inv, f_out )
197	DO k = nzb_y, nzt_y
198	DO i = nxl_y, nxr_y
199	DO j = 0, ny
200	f_out(j,i,k) = f_inv(j,k,i)
201	ENDDO
202	ENDDO
203	ENDDO
204	!$acc end kernels
205	!$OMP END PARALLEL
206
207	ENDIF
208
209	END SUBROUTINE transpose_xy
210
211
212	!------------------------------------------------------------------------------!
213	! Description:
214	! ------------
215	!> Resorting data after the transposition from x to z. The transposition itself
216	!> is carried out in transpose_xz
217	!------------------------------------------------------------------------------!
218	SUBROUTINE resort_for_xz( f_inv, f_out )
219
220
221	USE indices, &
222	ONLY: nxl, nxr, nyn, nys, nz
223
224	USE kinds
225
226	IMPLICIT NONE
227
228	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
229	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
230
231	INTEGER(iwp) :: i !<
232	INTEGER(iwp) :: j !<
233	INTEGER(iwp) :: k !<
234	!
235	!-- Rearrange indices of input array in order to make data to be send
236	!-- by MPI contiguous.
237	!-- In case of parallel fft/transposition, scattered store is faster in
238	!-- backward direction!!!
239	!$OMP PARALLEL PRIVATE ( i, j, k )
240	!$OMP DO
241	!$acc kernels present( f_inv, f_out )
242	DO k = 1, nz
243	DO i = nxl, nxr
244	DO j = nys, nyn
245	f_out(k,j,i) = f_inv(j,i,k)
246	ENDDO
247	ENDDO
248	ENDDO
249	!$acc end kernels
250	!$OMP END PARALLEL
251
252	END SUBROUTINE resort_for_xz
253
254
255	!------------------------------------------------------------------------------!
256	! Description:
257	! ------------
258	!> Transposition of input array (f_in) from x to z. For the input array, all
259	!> elements along x reside on the same PE, while after transposition, all
260	!> elements along z reside on the same PE.
261	!------------------------------------------------------------------------------!
262	SUBROUTINE transpose_xz( f_in, f_inv )
263
264
265	USE cpulog, &
266	ONLY: cpu_log, cpu_log_nowait, log_point_s
267
268	USE indices, &
269	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
270
271	USE kinds
272
273	USE pegrid
274
275	USE transpose_indices, &
276	ONLY: nyn_x, nys_x, nzb_x, nzt_x
277
278	IMPLICIT NONE
279
280	INTEGER(iwp) :: i !<
281	INTEGER(iwp) :: j !<
282	INTEGER(iwp) :: k !<
283	INTEGER(iwp) :: l !<
284	INTEGER(iwp) :: xs !<
285
286	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
287	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
288
289	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
290
291
292	!
293	!-- If the PE grid is one-dimensional along y, the array has only to be
294	!-- reordered locally and therefore no transposition has to be done.
295	IF ( pdims(1) /= 1 ) THEN
296
297	#if defined( __parallel )
298	!
299	!-- Reorder input array for transposition
300	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
301	!$OMP DO
302	!$acc data copyout( work )
303	DO l = 0, pdims(1) - 1
304	xs = 0 + l * nnx
305	!$acc kernels present( f_in, work )
306	DO k = nzb_x, nzt_x
307	DO i = xs, xs + nnx - 1
308	DO j = nys_x, nyn_x
309	work(j,i-xs+1,k,l) = f_in(i,j,k)
310	ENDDO
311	ENDDO
312	ENDDO
313	!$acc end kernels
314	ENDDO
315	!$acc end data
316	!$OMP END PARALLEL
317
318	!
319	!-- Transpose array
320	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
321	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
322	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
323	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
324	comm1dx, ierr )
325	!$acc update device( f_inv )
326	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
327	#endif
328
329	ELSE
330
331	!
332	!-- Reorder the array in a way that the z index is in first position
333	!$OMP PARALLEL PRIVATE ( i, j, k )
334	!$OMP DO
335	!$acc kernels present( f_in, f_inv )
336	DO i = nxl, nxr
337	DO j = nys, nyn
338	DO k = 1, nz
339	f_inv(j,i,k) = f_in(i,j,k)
340	ENDDO
341	ENDDO
342	ENDDO
343	!$acc end kernels
344	!$OMP END PARALLEL
345
346	ENDIF
347
348	END SUBROUTINE transpose_xz
349
350
351	!------------------------------------------------------------------------------!
352	! Description:
353	! ------------
354	!> Resorting data after the transposition from y to x. The transposition itself
355	!> is carried out in transpose_yx
356	!------------------------------------------------------------------------------!
357	SUBROUTINE resort_for_yx( f_inv, f_out )
358
359
360	USE indices, &
361	ONLY: nx
362
363	USE kinds
364
365	USE transpose_indices, &
366	ONLY: nyn_x, nys_x, nzb_x, nzt_x
367
368	IMPLICIT NONE
369
370	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
371	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
372
373
374	INTEGER(iwp) :: i !<
375	INTEGER(iwp) :: j !<
376	INTEGER(iwp) :: k !<
377	!
378	!-- Rearrange indices of input array in order to make data to be send
379	!-- by MPI contiguous
380	!$OMP PARALLEL PRIVATE ( i, j, k )
381	!$OMP DO
382	!$acc kernels present( f_inv, f_out )
383	DO i = 0, nx
384	DO k = nzb_x, nzt_x
385	DO j = nys_x, nyn_x
386	f_out(i,j,k) = f_inv(j,k,i)
387	ENDDO
388	ENDDO
389	ENDDO
390	!$acc end kernels
391	!$OMP END PARALLEL
392
393	END SUBROUTINE resort_for_yx
394
395
396	!------------------------------------------------------------------------------!
397	! Description:
398	! ------------
399	!> Transposition of input array (f_in) from y to x. For the input array, all
400	!> elements along y reside on the same PE, while after transposition, all
401	!> elements along x reside on the same PE.
402	!------------------------------------------------------------------------------!
403	SUBROUTINE transpose_yx( f_in, f_inv )
404
405
406	USE cpulog, &
407	ONLY: cpu_log, cpu_log_nowait, log_point_s
408
409	USE indices, &
410	ONLY: nx, ny
411
412	USE kinds
413
414	USE pegrid
415
416	USE transpose_indices, &
417	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
418
419	IMPLICIT NONE
420
421	INTEGER(iwp) :: i !<
422	INTEGER(iwp) :: j !<
423	INTEGER(iwp) :: k !<
424	INTEGER(iwp) :: l !<
425	INTEGER(iwp) :: ys !<
426
427	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
428	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
429
430	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
431
432
433	IF ( numprocs /= 1 ) THEN
434
435	#if defined( __parallel )
436	!
437	!-- Reorder input array for transposition
438	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
439	!$OMP DO
440	!$acc data copyout( work )
441	DO l = 0, pdims(2) - 1
442	ys = 0 + l * ( nyn_x - nys_x + 1 )
443	!$acc kernels present( f_in, work )
444	DO i = nxl_y, nxr_y
445	DO k = nzb_y, nzt_y
446	DO j = ys, ys + nyn_x - nys_x
447	work(j-ys+1,k,i,l) = f_in(j,i,k)
448	ENDDO
449	ENDDO
450	ENDDO
451	!$acc end kernels
452	ENDDO
453	!$acc end data
454	!$OMP END PARALLEL
455
456	!
457	!-- Transpose array
458	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
459	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
460	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
461	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
462	comm1dy, ierr )
463	!$acc update device( f_inv )
464	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
465	#endif
466
467	ELSE
468
469	!
470	!-- Reorder array f_in the same way as ALLTOALL did it
471	!$OMP PARALLEL PRIVATE ( i, j, k )
472	!$OMP DO
473	!$acc kernels present( f_in, f_inv )
474	DO i = nxl_y, nxr_y
475	DO k = nzb_y, nzt_y
476	DO j = 0, ny
477	f_inv(j,k,i) = f_in(j,i,k)
478	ENDDO
479	ENDDO
480	ENDDO
481	!$acc end kernels
482	!$OMP END PARALLEL
483
484	ENDIF
485
486	END SUBROUTINE transpose_yx
487
488
489	!------------------------------------------------------------------------------!
490	! Description:
491	! ------------
492	!> Transposition of input array (f_in) from y to x. For the input array, all
493	!> elements along y reside on the same PE, while after transposition, all
494	!> elements along x reside on the same PE.
495	!> This is a direct transposition for arrays with indices in regular order
496	!> (k,j,i) (cf. transpose_yx).
497	!------------------------------------------------------------------------------!
498	SUBROUTINE transpose_yxd( f_in, f_out )
499
500
501	USE cpulog, &
502	ONLY: cpu_log, cpu_log_nowait, log_point_s
503
504	USE indices, &
505	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
506
507	USE kinds
508
509	USE pegrid
510
511	USE transpose_indices, &
512	ONLY: nyn_x, nys_x, nzb_x, nzt_x
513
514	IMPLICIT NONE
515
516	INTEGER(iwp) :: i !<
517	INTEGER(iwp) :: j !<
518	INTEGER(iwp) :: k !<
519	INTEGER(iwp) :: l !<
520	INTEGER(iwp) :: m !<
521	INTEGER(iwp) :: xs !<
522
523	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
524	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
525	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
526	REAL(wp) :: work(nnxnnynnz) !<
527	#if defined( __parallel )
528
529	!
530	!-- Rearrange indices of input array in order to make data to be send
531	!-- by MPI contiguous
532	DO k = 1, nz
533	DO j = nys, nyn
534	DO i = nxl, nxr
535	f_inv(i,k,j) = f_in(k,j,i)
536	ENDDO
537	ENDDO
538	ENDDO
539
540	!
541	!-- Transpose array
542	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
543	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
544	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
545	work(1), sendrecvcount_xy, MPI_REAL, &
546	comm1dx, ierr )
547	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
548
549	!
550	!-- Reorder transposed array
551	m = 0
552	DO l = 0, pdims(1) - 1
553	xs = 0 + l * nnx
554	DO j = nys_x, nyn_x
555	DO k = 1, nz
556	DO i = xs, xs + nnx - 1
557	m = m + 1
558	f_out(i,j,k) = work(m)
559	ENDDO
560	ENDDO
561	ENDDO
562	ENDDO
563
564	#endif
565
566	END SUBROUTINE transpose_yxd
567
568
569	!------------------------------------------------------------------------------!
570	! Description:
571	! ------------
572	!> Resorting data for the transposition from y to z. The transposition itself
573	!> is carried out in transpose_yz
574	!------------------------------------------------------------------------------!
575	SUBROUTINE resort_for_yz( f_in, f_inv )
576
577
578	USE indices, &
579	ONLY: ny
580
581	USE kinds
582
583	USE transpose_indices, &
584	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
585
586	IMPLICIT NONE
587
588	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
589	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
590
591	INTEGER(iwp) :: i !<
592	INTEGER(iwp) :: j !<
593	INTEGER(iwp) :: k !<
594
595	!
596	!-- Rearrange indices of input array in order to make data to be send
597	!-- by MPI contiguous
598	!$OMP PARALLEL PRIVATE ( i, j, k )
599	!$OMP DO
600	!$acc kernels present( f_in, f_inv )
601	DO j = 0, ny
602	DO k = nzb_y, nzt_y
603	DO i = nxl_y, nxr_y
604	f_inv(i,k,j) = f_in(j,i,k)
605	ENDDO
606	ENDDO
607	ENDDO
608	!$acc end kernels
609	!$OMP END PARALLEL
610
611	END SUBROUTINE resort_for_yz
612
613
614	!------------------------------------------------------------------------------!
615	! Description:
616	! ------------
617	!> Transposition of input array (f_in) from y to z. For the input array, all
618	!> elements along y reside on the same PE, while after transposition, all
619	!> elements along z reside on the same PE.
620	!------------------------------------------------------------------------------!
621	SUBROUTINE transpose_yz( f_inv, f_out )
622
623
624	USE cpulog, &
625	ONLY: cpu_log, cpu_log_nowait, log_point_s
626
627	USE indices, &
628	ONLY: ny, nz
629
630	USE kinds
631
632	USE pegrid
633
634	USE transpose_indices, &
635	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
636
637	IMPLICIT NONE
638
639	INTEGER(iwp) :: i !<
640	INTEGER(iwp) :: j !<
641	INTEGER(iwp) :: k !<
642	INTEGER(iwp) :: l !<
643	INTEGER(iwp) :: zs !<
644
645	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
646	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
647
648	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
649
650
651	!
652	!-- If the PE grid is one-dimensional along y, only local reordering
653	!-- of the data is necessary and no transposition has to be done.
654	IF ( pdims(1) == 1 ) THEN
655
656	!$OMP PARALLEL PRIVATE ( i, j, k )
657	!$OMP DO
658	!$acc kernels present( f_inv, f_out )
659	DO j = 0, ny
660	DO k = nzb_y, nzt_y
661	DO i = nxl_y, nxr_y
662	f_out(i,j,k) = f_inv(i,k,j)
663	ENDDO
664	ENDDO
665	ENDDO
666	!$acc end kernels
667	!$OMP END PARALLEL
668
669	ELSE
670
671	#if defined( __parallel )
672	!
673	!-- Transpose array
674	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
675	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
676	!$acc update host( f_inv )
677	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
678	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
679	comm1dx, ierr )
680	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
681
682	!
683	!-- Reorder transposed array
684	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
685	!$OMP DO
686	!$acc data copyin( work )
687	DO l = 0, pdims(1) - 1
688	zs = 1 + l * ( nzt_y - nzb_y + 1 )
689	!$acc kernels present( f_out )
690	DO j = nys_z, nyn_z
691	DO k = zs, zs + nzt_y - nzb_y
692	DO i = nxl_z, nxr_z
693	f_out(i,j,k) = work(i,k-zs+1,j,l)
694	ENDDO
695	ENDDO
696	ENDDO
697	!$acc end kernels
698	ENDDO
699	!$acc end data
700	!$OMP END PARALLEL
701	#endif
702
703	ENDIF
704
705	END SUBROUTINE transpose_yz
706
707
708	!------------------------------------------------------------------------------!
709	! Description:
710	! ------------
711	!> Resorting data for the transposition from z to x. The transposition itself
712	!> is carried out in transpose_zx
713	!------------------------------------------------------------------------------!
714	SUBROUTINE resort_for_zx( f_in, f_inv )
715
716
717	USE indices, &
718	ONLY: nxl, nxr, nyn, nys, nz
719
720	USE kinds
721
722	IMPLICIT NONE
723
724	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
725	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
726
727	INTEGER(iwp) :: i !<
728	INTEGER(iwp) :: j !<
729	INTEGER(iwp) :: k !<
730
731	!
732	!-- Rearrange indices of input array in order to make data to be send
733	!-- by MPI contiguous
734	!$OMP PARALLEL PRIVATE ( i, j, k )
735	!$OMP DO
736	!$acc kernels present( f_in, f_inv )
737	DO k = 1,nz
738	DO i = nxl, nxr
739	DO j = nys, nyn
740	f_inv(j,i,k) = f_in(k,j,i)
741	ENDDO
742	ENDDO
743	ENDDO
744	!$acc end kernels
745	!$OMP END PARALLEL
746
747	END SUBROUTINE resort_for_zx
748
749
750	!------------------------------------------------------------------------------!
751	! Description:
752	! ------------
753	!> Transposition of input array (f_in) from z to x. For the input array, all
754	!> elements along z reside on the same PE, while after transposition, all
755	!> elements along x reside on the same PE.
756	!------------------------------------------------------------------------------!
757	SUBROUTINE transpose_zx( f_inv, f_out )
758
759
760	USE cpulog, &
761	ONLY: cpu_log, cpu_log_nowait, log_point_s
762
763	USE indices, &
764	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
765
766	USE kinds
767
768	USE pegrid
769
770	USE transpose_indices, &
771	ONLY: nyn_x, nys_x, nzb_x, nzt_x
772
773	IMPLICIT NONE
774
775	INTEGER(iwp) :: i !<
776	INTEGER(iwp) :: j !<
777	INTEGER(iwp) :: k !<
778	INTEGER(iwp) :: l !<
779	INTEGER(iwp) :: xs !<
780
781	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
782	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
783
784	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
785
786
787	!
788	!-- If the PE grid is one-dimensional along y, only local reordering
789	!-- of the data is necessary and no transposition has to be done.
790	IF ( pdims(1) == 1 ) THEN
791
792	!$OMP PARALLEL PRIVATE ( i, j, k )
793	!$OMP DO
794	!$acc kernels present( f_inv, f_out )
795	DO k = 1, nz
796	DO i = nxl, nxr
797	DO j = nys, nyn
798	f_out(i,j,k) = f_inv(j,i,k)
799	ENDDO
800	ENDDO
801	ENDDO
802	!$acc end kernels
803	!$OMP END PARALLEL
804
805	ELSE
806
807	#if defined( __parallel )
808	!
809	!-- Transpose array
810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
811	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
812	!$acc update host( f_inv )
813	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
814	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
815	comm1dx, ierr )
816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
817
818	!
819	!-- Reorder transposed array
820	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
821	!$OMP DO
822	!$acc data copyin( work )
823	DO l = 0, pdims(1) - 1
824	xs = 0 + l * nnx
825	!$acc kernels present( f_out )
826	DO k = nzb_x, nzt_x
827	DO i = xs, xs + nnx - 1
828	DO j = nys_x, nyn_x
829	f_out(i,j,k) = work(j,i-xs+1,k,l)
830	ENDDO
831	ENDDO
832	ENDDO
833	!$acc end kernels
834	ENDDO
835	!$acc end data
836	!$OMP END PARALLEL
837	#endif
838
839	ENDIF
840
841	END SUBROUTINE transpose_zx
842
843
844	!------------------------------------------------------------------------------!
845	! Description:
846	! ------------
847	!> Resorting data after the transposition from z to y. The transposition itself
848	!> is carried out in transpose_zy
849	!------------------------------------------------------------------------------!
850	SUBROUTINE resort_for_zy( f_inv, f_out )
851
852
853	USE indices, &
854	ONLY: ny
855
856	USE kinds
857
858	USE transpose_indices, &
859	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
860
861	IMPLICIT NONE
862
863	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
864	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
865
866
867	INTEGER(iwp) :: i !<
868	INTEGER(iwp) :: j !<
869	INTEGER(iwp) :: k !<
870
871	!
872	!-- Rearrange indices of input array in order to make data to be send
873	!-- by MPI contiguous
874	!$OMP PARALLEL PRIVATE ( i, j, k )
875	!$OMP DO
876	!$acc kernels present( f_inv, f_out )
877	DO k = nzb_y, nzt_y
878	DO j = 0, ny
879	DO i = nxl_y, nxr_y
880	f_out(j,i,k) = f_inv(i,k,j)
881	ENDDO
882	ENDDO
883	ENDDO
884	!$acc end kernels
885	!$OMP END PARALLEL
886
887	END SUBROUTINE resort_for_zy
888
889
890	!------------------------------------------------------------------------------!
891	! Description:
892	! ------------
893	!> Transposition of input array (f_in) from z to y. For the input array, all
894	!> elements along z reside on the same PE, while after transposition, all
895	!> elements along y reside on the same PE.
896	!------------------------------------------------------------------------------!
897	SUBROUTINE transpose_zy( f_in, f_inv )
898
899
900	USE cpulog, &
901	ONLY: cpu_log, cpu_log_nowait, log_point_s
902
903	USE indices, &
904	ONLY: ny, nz
905
906	USE kinds
907
908	USE pegrid
909
910	USE transpose_indices, &
911	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
912
913	IMPLICIT NONE
914
915	INTEGER(iwp) :: i !<
916	INTEGER(iwp) :: j !<
917	INTEGER(iwp) :: k !<
918	INTEGER(iwp) :: l !<
919	INTEGER(iwp) :: zs !<
920
921	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
922	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
923
924	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
925
926	!
927	!-- If the PE grid is one-dimensional along y, the array has only to be
928	!-- reordered locally and therefore no transposition has to be done.
929	IF ( pdims(1) /= 1 ) THEN
930
931	#if defined( __parallel )
932	!
933	!-- Reorder input array for transposition
934	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
935	!$OMP DO
936	!$acc data copyout( work )
937	DO l = 0, pdims(1) - 1
938	zs = 1 + l * ( nzt_y - nzb_y + 1 )
939	!$acc kernels present( f_in, work )
940	DO j = nys_z, nyn_z
941	DO k = zs, zs + nzt_y - nzb_y
942	DO i = nxl_z, nxr_z
943	work(i,k-zs+1,j,l) = f_in(i,j,k)
944	ENDDO
945	ENDDO
946	ENDDO
947	!$acc end kernels
948	ENDDO
949	!$acc end data
950	!$OMP END PARALLEL
951
952	!
953	!-- Transpose array
954	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
955	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
956	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
957	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
958	comm1dx, ierr )
959	!$acc update device( f_inv )
960	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
961	#endif
962
963	ELSE
964	!
965	!-- Reorder the array in the same way like ALLTOALL did it
966	!$OMP PARALLEL PRIVATE ( i, j, k )
967	!$OMP DO
968	!$acc kernels present( f_in, f_inv )
969	DO k = nzb_y, nzt_y
970	DO j = 0, ny
971	DO i = nxl_y, nxr_y
972	f_inv(i,k,j) = f_in(i,j,k)
973	ENDDO
974	ENDDO
975	ENDDO
976	!$acc end kernels
977	!$OMP END PARALLEL
978
979	ENDIF
980
981	END SUBROUTINE transpose_zy
982
983
984	!------------------------------------------------------------------------------!
985	! Description:
986	! ------------
987	!> Transposition of input array (f_in) from z to y. For the input array, all
988	!> elements along z reside on the same PE, while after transposition, all
989	!> elements along y reside on the same PE.
990	!> This is a direct transposition for arrays with indices in regular order
991	!> (k,j,i) (cf. transpose_zy).
992	!------------------------------------------------------------------------------!
993	SUBROUTINE transpose_zyd( f_in, f_out )
994
995
996	USE cpulog, &
997	ONLY: cpu_log, cpu_log_nowait, log_point_s
998
999	USE indices, &
1000	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1001
1002	USE kinds
1003
1004	USE pegrid
1005
1006	USE transpose_indices, &
1007	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
1008
1009	IMPLICIT NONE
1010
1011	INTEGER(iwp) :: i !<
1012	INTEGER(iwp) :: j !<
1013	INTEGER(iwp) :: k !<
1014	INTEGER(iwp) :: l !<
1015	INTEGER(iwp) :: m !<
1016	INTEGER(iwp) :: ys !<
1017
1018	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1019	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1020	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1021	REAL(wp) :: work(nnxnnynnz) !<
1022
1023	#if defined( __parallel )
1024
1025	!
1026	!-- Rearrange indices of input array in order to make data to be send
1027	!-- by MPI contiguous
1028	DO i = nxl, nxr
1029	DO j = nys, nyn
1030	DO k = 1, nz
1031	f_inv(j,i,k) = f_in(k,j,i)
1032	ENDDO
1033	ENDDO
1034	ENDDO
1035
1036	!
1037	!-- Move data to different array, because memory location of work1 is
1038	!-- needed further below (work1 = work2).
1039	!-- If the PE grid is one-dimensional along x, only local reordering
1040	!-- of the data is necessary and no transposition has to be done.
1041	IF ( pdims(2) == 1 ) THEN
1042	DO k = 1, nz
1043	DO i = nxl, nxr
1044	DO j = nys, nyn
1045	f_out(j,i,k) = f_inv(j,i,k)
1046	ENDDO
1047	ENDDO
1048	ENDDO
1049	RETURN
1050	ENDIF
1051
1052	!
1053	!-- Transpose array
1054	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1055	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1056	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1057	work(1), sendrecvcount_zyd, MPI_REAL, &
1058	comm1dy, ierr )
1059	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1060
1061	!
1062	!-- Reorder transposed array
1063	m = 0
1064	DO l = 0, pdims(2) - 1
1065	ys = 0 + l * nny
1066	DO k = nzb_yd, nzt_yd
1067	DO i = nxl_yd, nxr_yd
1068	DO j = ys, ys + nny - 1
1069	m = m + 1
1070	f_out(j,i,k) = work(m)
1071	ENDDO
1072	ENDDO
1073	ENDDO
1074	ENDDO
1075
1076	#endif
1077
1078	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |