Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1321

Last change on this file since 1321 was 1321, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.2 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1321 2014-03-20 09:40:40Z raasch $
27	!
28	! 1320 2014-03-20 08:40:49Z raasch
29	! ONLY-attribute added to USE-statements,
30	! kind-parameters added to all INTEGER and REAL declaration statements,
31	! kinds are defined in new module kinds,
32	! old module precision_kind is removed,
33	! revision history before 2012 removed,
34	! comment fields (!:) to be used for variable explanations added to
35	! all variable declaration statements
36	!
37	! 1318 2014-03-17 13:35:16Z raasch
38	! cpu_log_nowait parameter added to cpu measurements of the transpositions
39	! required for solving the Poisson equation (poisfft),
40	! module interfaces removed
41	!
42	! 1257 2013-11-08 15:18:40Z raasch
43	! openacc loop and loop vector clauses removed
44	!
45	! 1216 2013-08-26 09:31:42Z raasch
46	! re-sorting of the transposed / to be transposed arrays moved to separate
47	! routines resort_for_...
48	!
49	! 1111 2013-03-08 23:54:10Z raasch
50	! openACC directives added,
51	! resorting data from/to work changed, work got 4 dimensions instead of 1
52	!
53	! 1106 2013-03-04 05:31:38Z raasch
54	! preprocessor lines rearranged so that routines can also be used in serial
55	! (non-parallel) mode
56	!
57	! 1092 2013-02-02 11:24:22Z raasch
58	! unused variables removed
59	!
60	! 1036 2012-10-22 13:43:42Z raasch
61	! code put under GPL (PALM 3.9)
62	!
63	! 1003 2012-09-14 14:35:53Z raasch
64	! indices nxa, nya, etc. replaced by nx, ny, etc.
65	!
66	! Revision 1.1 1997/07/24 11:25:18 raasch
67	! Initial revision
68	!
69	!------------------------------------------------------------------------------!
70	! Description:
71	! ------------
72	! Resorting data for the transposition from x to y. The transposition itself
73	! is carried out in transpose_xy
74	!------------------------------------------------------------------------------!
75
76	USE indices, &
77	ONLY: nx
78
79	USE kinds
80
81	USE transpose_indices, &
82	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
83
84	IMPLICIT NONE
85
86	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
87	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
88
89
90	INTEGER(iwp) :: i !:
91	INTEGER(iwp) :: j !:
92	INTEGER(iwp) :: k !:
93	!
94	!-- Rearrange indices of input array in order to make data to be send
95	!-- by MPI contiguous
96	!$OMP PARALLEL PRIVATE ( i, j, k )
97	!$OMP DO
98	!$acc kernels present( f_in, f_inv )
99	DO i = 0, nx
100	DO k = nzb_x, nzt_x
101	DO j = nys_x, nyn_x
102	f_inv(j,k,i) = f_in(i,j,k)
103	ENDDO
104	ENDDO
105	ENDDO
106	!$acc end kernels
107	!$OMP END PARALLEL
108
109	END SUBROUTINE resort_for_xy
110
111
112	SUBROUTINE transpose_xy( f_inv, f_out )
113
114	!------------------------------------------------------------------------------!
115	! Description:
116	! ------------
117	! Transposition of input array (f_in) from x to y. For the input array, all
118	! elements along x reside on the same PE, while after transposition, all
119	! elements along y reside on the same PE.
120	!------------------------------------------------------------------------------!
121
122	USE cpulog, &
123	ONLY: cpu_log, cpu_log_nowait, log_point_s
124
125	USE indices, &
126	ONLY: nx, ny
127
128	USE kinds
129
130	USE pegrid
131
132	USE transpose_indices, &
133	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
134
135	IMPLICIT NONE
136
137	INTEGER(iwp) :: i !:
138	INTEGER(iwp) :: j !:
139	INTEGER(iwp) :: k !:
140	INTEGER(iwp) :: l !:
141	INTEGER(iwp) :: ys !:
142
143	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
144	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
145
146	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
147
148
149	IF ( numprocs /= 1 ) THEN
150
151	#if defined( __parallel )
152	!
153	!-- Transpose array
154	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
155	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
156	!$acc update host( f_inv )
157	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
158	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
159	comm1dy, ierr )
160	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
161
162	!
163	!-- Reorder transposed array
164	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
165	!$OMP DO
166	!$acc data copyin( work )
167	DO l = 0, pdims(2) - 1
168	ys = 0 + l * ( nyn_x - nys_x + 1 )
169	!$acc kernels present( f_out, work )
170	DO i = nxl_y, nxr_y
171	DO k = nzb_y, nzt_y
172	DO j = ys, ys + nyn_x - nys_x
173	f_out(j,i,k) = work(j-ys+1,k,i,l)
174	ENDDO
175	ENDDO
176	ENDDO
177	!$acc end kernels
178	ENDDO
179	!$acc end data
180	!$OMP END PARALLEL
181	#endif
182
183	ELSE
184
185	!
186	!-- Reorder transposed array
187	!$OMP PARALLEL PRIVATE ( i, j, k )
188	!$OMP DO
189	!$acc kernels present( f_inv, f_out )
190	DO k = nzb_y, nzt_y
191	DO i = nxl_y, nxr_y
192	DO j = 0, ny
193	f_out(j,i,k) = f_inv(j,k,i)
194	ENDDO
195	ENDDO
196	ENDDO
197	!$acc end kernels
198	!$OMP END PARALLEL
199
200	ENDIF
201
202	END SUBROUTINE transpose_xy
203
204
205	SUBROUTINE resort_for_xz( f_inv, f_out )
206
207	!------------------------------------------------------------------------------!
208	! Description:
209	! ------------
210	! Resorting data after the transposition from x to z. The transposition itself
211	! is carried out in transpose_xz
212	!------------------------------------------------------------------------------!
213
214	USE indices, &
215	ONLY: nxl, nxr, nyn, nys, nz
216
217	USE kinds
218
219	IMPLICIT NONE
220
221	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
222	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !:
223
224	INTEGER(iwp) :: i !:
225	INTEGER(iwp) :: j !:
226	INTEGER(iwp) :: k !:
227	!
228	!-- Rearrange indices of input array in order to make data to be send
229	!-- by MPI contiguous.
230	!-- In case of parallel fft/transposition, scattered store is faster in
231	!-- backward direction!!!
232	!$OMP PARALLEL PRIVATE ( i, j, k )
233	!$OMP DO
234	!$acc kernels present( f_inv, f_out )
235	DO k = 1, nz
236	DO i = nxl, nxr
237	DO j = nys, nyn
238	f_out(k,j,i) = f_inv(j,i,k)
239	ENDDO
240	ENDDO
241	ENDDO
242	!$acc end kernels
243	!$OMP END PARALLEL
244
245	END SUBROUTINE resort_for_xz
246
247
248	SUBROUTINE transpose_xz( f_in, f_inv )
249
250	!------------------------------------------------------------------------------!
251	! Description:
252	! ------------
253	! Transposition of input array (f_in) from x to z. For the input array, all
254	! elements along x reside on the same PE, while after transposition, all
255	! elements along z reside on the same PE.
256	!------------------------------------------------------------------------------!
257
258	USE cpulog, &
259	ONLY: cpu_log, cpu_log_nowait, log_point_s
260
261	USE indices, &
262	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
263
264	USE kinds
265
266	USE pegrid, &
267	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
268	pdims, sendrecvcount_zx
269
270	USE transpose_indices, &
271	ONLY: nyn_x, nys_x, nzb_x, nzt_x
272
273	IMPLICIT NONE
274
275	INTEGER(iwp) :: i !:
276	INTEGER(iwp) :: j !:
277	INTEGER(iwp) :: k !:
278	INTEGER(iwp) :: l !:
279	INTEGER(iwp) :: xs !:
280
281	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
282	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
283
284	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
285
286
287	!
288	!-- If the PE grid is one-dimensional along y, the array has only to be
289	!-- reordered locally and therefore no transposition has to be done.
290	IF ( pdims(1) /= 1 ) THEN
291
292	#if defined( __parallel )
293	!
294	!-- Reorder input array for transposition
295	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
296	!$OMP DO
297	!$acc data copyout( work )
298	DO l = 0, pdims(1) - 1
299	xs = 0 + l * nnx
300	!$acc kernels present( f_in, work )
301	DO k = nzb_x, nzt_x
302	DO i = xs, xs + nnx - 1
303	DO j = nys_x, nyn_x
304	work(j,i-xs+1,k,l) = f_in(i,j,k)
305	ENDDO
306	ENDDO
307	ENDDO
308	!$acc end kernels
309	ENDDO
310	!$acc end data
311	!$OMP END PARALLEL
312
313	!
314	!-- Transpose array
315	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
316	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
317	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
318	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
319	comm1dx, ierr )
320	!$acc update device( f_inv )
321	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
322	#endif
323
324	ELSE
325
326	!
327	!-- Reorder the array in a way that the z index is in first position
328	!$OMP PARALLEL PRIVATE ( i, j, k )
329	!$OMP DO
330	!$acc kernels present( f_in, f_inv )
331	DO i = nxl, nxr
332	DO j = nys, nyn
333	DO k = 1, nz
334	f_inv(j,i,k) = f_in(i,j,k)
335	ENDDO
336	ENDDO
337	ENDDO
338	!$acc end kernels
339	!$OMP END PARALLEL
340
341	ENDIF
342
343	END SUBROUTINE transpose_xz
344
345
346	SUBROUTINE resort_for_yx( f_inv, f_out )
347
348	!------------------------------------------------------------------------------!
349	! Description:
350	! ------------
351	! Resorting data after the transposition from y to x. The transposition itself
352	! is carried out in transpose_yx
353	!------------------------------------------------------------------------------!
354
355	USE indices, &
356	ONLY: nx
357
358	USE kinds
359
360	USE transpose_indices, &
361	ONLY: nyn_x, nys_x, nzb_x, nzt_x
362
363	IMPLICIT NONE
364
365	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
366	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
367
368
369	INTEGER(iwp) :: i !:
370	INTEGER(iwp) :: j !:
371	INTEGER(iwp) :: k !:
372	!
373	!-- Rearrange indices of input array in order to make data to be send
374	!-- by MPI contiguous
375	!$OMP PARALLEL PRIVATE ( i, j, k )
376	!$OMP DO
377	!$acc kernels present( f_inv, f_out )
378	DO i = 0, nx
379	DO k = nzb_x, nzt_x
380	DO j = nys_x, nyn_x
381	f_out(i,j,k) = f_inv(j,k,i)
382	ENDDO
383	ENDDO
384	ENDDO
385	!$acc end kernels
386	!$OMP END PARALLEL
387
388	END SUBROUTINE resort_for_yx
389
390
391	SUBROUTINE transpose_yx( f_in, f_inv )
392
393	!------------------------------------------------------------------------------!
394	! Description:
395	! ------------
396	! Transposition of input array (f_in) from y to x. For the input array, all
397	! elements along y reside on the same PE, while after transposition, all
398	! elements along x reside on the same PE.
399	!------------------------------------------------------------------------------!
400
401	USE cpulog, &
402	ONLY: cpu_log, cpu_log_nowait, log_point_s
403
404	USE indices, &
405	ONLY: nx, ny
406
407	USE kinds
408
409	USE pegrid, &
410	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
411	numprocs, pdims, sendrecvcount_xy
412
413	USE transpose_indices, &
414	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
415
416	IMPLICIT NONE
417
418	INTEGER(iwp) :: i !:
419	INTEGER(iwp) :: j !:
420	INTEGER(iwp) :: k !:
421	INTEGER(iwp) :: l !:
422	INTEGER(iwp) :: ys !:
423
424	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
425	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
426
427	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
428
429
430	IF ( numprocs /= 1 ) THEN
431
432	#if defined( __parallel )
433	!
434	!-- Reorder input array for transposition
435	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
436	!$OMP DO
437	!$acc data copyout( work )
438	DO l = 0, pdims(2) - 1
439	ys = 0 + l * ( nyn_x - nys_x + 1 )
440	!$acc kernels present( f_in, work )
441	DO i = nxl_y, nxr_y
442	DO k = nzb_y, nzt_y
443	DO j = ys, ys + nyn_x - nys_x
444	work(j-ys+1,k,i,l) = f_in(j,i,k)
445	ENDDO
446	ENDDO
447	ENDDO
448	!$acc end kernels
449	ENDDO
450	!$acc end data
451	!$OMP END PARALLEL
452
453	!
454	!-- Transpose array
455	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
456	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
457	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
458	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
459	comm1dy, ierr )
460	!$acc update device( f_inv )
461	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
462	#endif
463
464	ELSE
465
466	!
467	!-- Reorder array f_in the same way as ALLTOALL did it
468	!$OMP PARALLEL PRIVATE ( i, j, k )
469	!$OMP DO
470	!$acc kernels present( f_in, f_inv )
471	DO i = nxl_y, nxr_y
472	DO k = nzb_y, nzt_y
473	DO j = 0, ny
474	f_inv(j,k,i) = f_in(j,i,k)
475	ENDDO
476	ENDDO
477	ENDDO
478	!$acc end kernels
479	!$OMP END PARALLEL
480
481	ENDIF
482
483	END SUBROUTINE transpose_yx
484
485
486	SUBROUTINE transpose_yxd( f_in, f_out )
487
488	!------------------------------------------------------------------------------!
489	! Description:
490	! ------------
491	! Transposition of input array (f_in) from y to x. For the input array, all
492	! elements along y reside on the same PE, while after transposition, all
493	! elements along x reside on the same PE.
494	! This is a direct transposition for arrays with indices in regular order
495	! (k,j,i) (cf. transpose_yx).
496	!------------------------------------------------------------------------------!
497
498	USE cpulog, &
499	ONLY: cpu_log, cpu_log_nowait, log_point_s
500
501	USE indices, &
502	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
503
504	USE kinds
505
506	USE pegrid, &
507	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
508	pdims, sendrecvcount_xy
509
510	USE transpose_indices, &
511	ONLY: nyn_x, nys_x, nzb_x, nzt_x
512
513	IMPLICIT NONE
514
515	INTEGER(iwp) :: i !:
516	INTEGER(iwp) :: j !:
517	INTEGER(iwp) :: k !:
518	INTEGER(iwp) :: l !:
519	INTEGER(iwp) :: m !:
520	INTEGER(iwp) :: xs !:
521
522	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
523	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !:
524	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
525	REAL(wp) :: work(nnxnnynnz) !:
526	#if defined( __parallel )
527
528	!
529	!-- Rearrange indices of input array in order to make data to be send
530	!-- by MPI contiguous
531	DO k = 1, nz
532	DO j = nys, nyn
533	DO i = nxl, nxr
534	f_inv(i,k,j) = f_in(k,j,i)
535	ENDDO
536	ENDDO
537	ENDDO
538
539	!
540	!-- Transpose array
541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
543	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
544	work(1), sendrecvcount_xy, MPI_REAL, &
545	comm1dx, ierr )
546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
547
548	!
549	!-- Reorder transposed array
550	m = 0
551	DO l = 0, pdims(1) - 1
552	xs = 0 + l * nnx
553	DO j = nys_x, nyn_x
554	DO k = 1, nz
555	DO i = xs, xs + nnx - 1
556	m = m + 1
557	f_out(i,j,k) = work(m)
558	ENDDO
559	ENDDO
560	ENDDO
561	ENDDO
562
563	#endif
564
565	END SUBROUTINE transpose_yxd
566
567
568	SUBROUTINE resort_for_yz( f_in, f_inv )
569
570	!------------------------------------------------------------------------------!
571	! Description:
572	! ------------
573	! Resorting data for the transposition from y to z. The transposition itself
574	! is carried out in transpose_yz
575	!------------------------------------------------------------------------------!
576
577	USE indices, &
578	ONLY: ny
579
580	USE kinds
581
582	USE transpose_indices, &
583	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
584
585	IMPLICIT NONE
586
587	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
588	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
589
590	INTEGER(iwp) :: i !:
591	INTEGER(iwp) :: j !:
592	INTEGER(iwp) :: k !:
593
594	!
595	!-- Rearrange indices of input array in order to make data to be send
596	!-- by MPI contiguous
597	!$OMP PARALLEL PRIVATE ( i, j, k )
598	!$OMP DO
599	!$acc kernels present( f_in, f_inv )
600	DO j = 0, ny
601	DO k = nzb_y, nzt_y
602	DO i = nxl_y, nxr_y
603	f_inv(i,k,j) = f_in(j,i,k)
604	ENDDO
605	ENDDO
606	ENDDO
607	!$acc end kernels
608	!$OMP END PARALLEL
609
610	END SUBROUTINE resort_for_yz
611
612
613	SUBROUTINE transpose_yz( f_inv, f_out )
614
615	!------------------------------------------------------------------------------!
616	! Description:
617	! ------------
618	! Transposition of input array (f_in) from y to z. For the input array, all
619	! elements along y reside on the same PE, while after transposition, all
620	! elements along z reside on the same PE.
621	!------------------------------------------------------------------------------!
622
623	USE cpulog, &
624	ONLY: cpu_log, cpu_log_nowait, log_point_s
625
626	USE indices, &
627	ONLY: ny, nz
628
629	USE kinds
630
631	USE pegrid, &
632	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
633	pdims, sendrecvcount_yz
634
635	USE transpose_indices, &
636	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
637
638	IMPLICIT NONE
639
640	INTEGER(iwp) :: i !:
641	INTEGER(iwp) :: j !:
642	INTEGER(iwp) :: k !:
643	INTEGER(iwp) :: l !:
644	INTEGER(iwp) :: zs !:
645
646	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
647	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
648
649	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
650
651
652	!
653	!-- If the PE grid is one-dimensional along y, only local reordering
654	!-- of the data is necessary and no transposition has to be done.
655	IF ( pdims(1) == 1 ) THEN
656
657	!$OMP PARALLEL PRIVATE ( i, j, k )
658	!$OMP DO
659	!$acc kernels present( f_inv, f_out )
660	DO j = 0, ny
661	DO k = nzb_y, nzt_y
662	DO i = nxl_y, nxr_y
663	f_out(i,j,k) = f_inv(i,k,j)
664	ENDDO
665	ENDDO
666	ENDDO
667	!$acc end kernels
668	!$OMP END PARALLEL
669
670	ELSE
671
672	#if defined( __parallel )
673	!
674	!-- Transpose array
675	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
676	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
677	!$acc update host( f_inv )
678	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
679	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
680	comm1dx, ierr )
681	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
682
683	!
684	!-- Reorder transposed array
685	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
686	!$OMP DO
687	!$acc data copyin( work )
688	DO l = 0, pdims(1) - 1
689	zs = 1 + l * ( nzt_y - nzb_y + 1 )
690	!$acc kernels present( f_out )
691	DO j = nys_z, nyn_z
692	DO k = zs, zs + nzt_y - nzb_y
693	DO i = nxl_z, nxr_z
694	f_out(i,j,k) = work(i,k-zs+1,j,l)
695	ENDDO
696	ENDDO
697	ENDDO
698	!$acc end kernels
699	ENDDO
700	!$acc end data
701	!$OMP END PARALLEL
702	#endif
703
704	ENDIF
705
706	END SUBROUTINE transpose_yz
707
708
709	SUBROUTINE resort_for_zx( f_in, f_inv )
710
711	!------------------------------------------------------------------------------!
712	! Description:
713	! ------------
714	! Resorting data for the transposition from z to x. The transposition itself
715	! is carried out in transpose_zx
716	!------------------------------------------------------------------------------!
717
718	USE indices, &
719	ONLY: nxl, nxr, nyn, nys, nz
720
721	USE kinds
722
723	IMPLICIT NONE
724
725	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
726	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
727
728	INTEGER(iwp) :: i !:
729	INTEGER(iwp) :: j !:
730	INTEGER(iwp) :: k !:
731
732	!
733	!-- Rearrange indices of input array in order to make data to be send
734	!-- by MPI contiguous
735	!$OMP PARALLEL PRIVATE ( i, j, k )
736	!$OMP DO
737	!$acc kernels present( f_in, f_inv )
738	DO k = 1,nz
739	DO i = nxl, nxr
740	DO j = nys, nyn
741	f_inv(j,i,k) = f_in(k,j,i)
742	ENDDO
743	ENDDO
744	ENDDO
745	!$acc end kernels
746	!$OMP END PARALLEL
747
748	END SUBROUTINE resort_for_zx
749
750
751	SUBROUTINE transpose_zx( f_inv, f_out )
752
753	!------------------------------------------------------------------------------!
754	! Description:
755	! ------------
756	! Transposition of input array (f_in) from z to x. For the input array, all
757	! elements along z reside on the same PE, while after transposition, all
758	! elements along x reside on the same PE.
759	!------------------------------------------------------------------------------!
760
761	USE cpulog, &
762	ONLY: cpu_log, cpu_log_nowait, log_point_s
763
764	USE indices, &
765	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
766
767	USE kinds
768
769	USE pegrid, &
770	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
771	pdims, sendrecvcount_zx
772
773	USE transpose_indices, &
774	ONLY: nyn_x, nys_x, nzb_x, nzt_x
775
776	IMPLICIT NONE
777
778	INTEGER(iwp) :: i !:
779	INTEGER(iwp) :: j !:
780	INTEGER(iwp) :: k !:
781	INTEGER(iwp) :: l !:
782	INTEGER(iwp) :: xs !:
783
784	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
785	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
786
787	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
788
789
790	!
791	!-- If the PE grid is one-dimensional along y, only local reordering
792	!-- of the data is necessary and no transposition has to be done.
793	IF ( pdims(1) == 1 ) THEN
794
795	!$OMP PARALLEL PRIVATE ( i, j, k )
796	!$OMP DO
797	!$acc kernels present( f_inv, f_out )
798	DO k = 1, nz
799	DO i = nxl, nxr
800	DO j = nys, nyn
801	f_out(i,j,k) = f_inv(j,i,k)
802	ENDDO
803	ENDDO
804	ENDDO
805	!$acc end kernels
806	!$OMP END PARALLEL
807
808	ELSE
809
810	#if defined( __parallel )
811	!
812	!-- Transpose array
813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
814	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
815	!$acc update host( f_inv )
816	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
817	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
818	comm1dx, ierr )
819	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
820
821	!
822	!-- Reorder transposed array
823	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
824	!$OMP DO
825	!$acc data copyin( work )
826	DO l = 0, pdims(1) - 1
827	xs = 0 + l * nnx
828	!$acc kernels present( f_out )
829	DO k = nzb_x, nzt_x
830	DO i = xs, xs + nnx - 1
831	DO j = nys_x, nyn_x
832	f_out(i,j,k) = work(j,i-xs+1,k,l)
833	ENDDO
834	ENDDO
835	ENDDO
836	!$acc end kernels
837	ENDDO
838	!$acc end data
839	!$OMP END PARALLEL
840	#endif
841
842	ENDIF
843
844	END SUBROUTINE transpose_zx
845
846
847	SUBROUTINE resort_for_zy( f_inv, f_out )
848
849	!------------------------------------------------------------------------------!
850	! Description:
851	! ------------
852	! Resorting data after the transposition from z to y. The transposition itself
853	! is carried out in transpose_zy
854	!------------------------------------------------------------------------------!
855
856	USE indices, &
857	ONLY: ny
858
859	USE kinds
860
861	USE transpose_indices, &
862	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
863
864	IMPLICIT NONE
865
866	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
867	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
868
869
870	INTEGER(iwp) :: i !:
871	INTEGER(iwp) :: j !:
872	INTEGER(iwp) :: k !:
873
874	!
875	!-- Rearrange indices of input array in order to make data to be send
876	!-- by MPI contiguous
877	!$OMP PARALLEL PRIVATE ( i, j, k )
878	!$OMP DO
879	!$acc kernels present( f_inv, f_out )
880	DO k = nzb_y, nzt_y
881	DO j = 0, ny
882	DO i = nxl_y, nxr_y
883	f_out(j,i,k) = f_inv(i,k,j)
884	ENDDO
885	ENDDO
886	ENDDO
887	!$acc end kernels
888	!$OMP END PARALLEL
889
890	END SUBROUTINE resort_for_zy
891
892
893	SUBROUTINE transpose_zy( f_in, f_inv )
894
895	!------------------------------------------------------------------------------!
896	! Description:
897	! ------------
898	! Transposition of input array (f_in) from z to y. For the input array, all
899	! elements along z reside on the same PE, while after transposition, all
900	! elements along y reside on the same PE.
901	!------------------------------------------------------------------------------!
902
903	USE cpulog, &
904	ONLY: cpu_log, cpu_log_nowait, log_point_s
905
906	USE indices, &
907	ONLY: ny, nz
908
909	USE kinds
910
911	USE pegrid, &
912	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
913	pdims, sendrecvcount_yz
914
915	USE transpose_indices, &
916	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
917
918	IMPLICIT NONE
919
920	INTEGER(iwp) :: i !:
921	INTEGER(iwp) :: j !:
922	INTEGER(iwp) :: k !:
923	INTEGER(iwp) :: l !:
924	INTEGER(iwp) :: zs !:
925
926	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
927	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
928
929	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
930
931	!
932	!-- If the PE grid is one-dimensional along y, the array has only to be
933	!-- reordered locally and therefore no transposition has to be done.
934	IF ( pdims(1) /= 1 ) THEN
935
936	#if defined( __parallel )
937	!
938	!-- Reorder input array for transposition
939	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
940	!$OMP DO
941	!$acc data copyout( work )
942	DO l = 0, pdims(1) - 1
943	zs = 1 + l * ( nzt_y - nzb_y + 1 )
944	!$acc kernels present( f_in, work )
945	DO j = nys_z, nyn_z
946	DO k = zs, zs + nzt_y - nzb_y
947	DO i = nxl_z, nxr_z
948	work(i,k-zs+1,j,l) = f_in(i,j,k)
949	ENDDO
950	ENDDO
951	ENDDO
952	!$acc end kernels
953	ENDDO
954	!$acc end data
955	!$OMP END PARALLEL
956
957	!
958	!-- Transpose array
959	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
960	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
961	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
962	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
963	comm1dx, ierr )
964	!$acc update device( f_inv )
965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
966	#endif
967
968	ELSE
969	!
970	!-- Reorder the array in the same way like ALLTOALL did it
971	!$OMP PARALLEL PRIVATE ( i, j, k )
972	!$OMP DO
973	!$acc kernels present( f_in, f_inv )
974	DO k = nzb_y, nzt_y
975	DO j = 0, ny
976	DO i = nxl_y, nxr_y
977	f_inv(i,k,j) = f_in(i,j,k)
978	ENDDO
979	ENDDO
980	ENDDO
981	!$acc end kernels
982	!$OMP END PARALLEL
983
984	ENDIF
985
986	END SUBROUTINE transpose_zy
987
988
989	SUBROUTINE transpose_zyd( f_in, f_out )
990
991	!------------------------------------------------------------------------------!
992	! Description:
993	! ------------
994	! Transposition of input array (f_in) from z to y. For the input array, all
995	! elements along z reside on the same PE, while after transposition, all
996	! elements along y reside on the same PE.
997	! This is a direct transposition for arrays with indices in regular order
998	! (k,j,i) (cf. transpose_zy).
999	!------------------------------------------------------------------------------!
1000
1001	USE cpulog, &
1002	ONLY: cpu_log, cpu_log_nowait, log_point_s
1003
1004	USE indices, &
1005	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1006
1007	USE kinds
1008
1009	USE pegrid, &
1010	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
1011	pdims, sendrecvcount_zyd
1012
1013	USE transpose_indices, &
1014	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
1015
1016	IMPLICIT NONE
1017
1018	INTEGER(iwp) :: i !:
1019	INTEGER(iwp) :: j !:
1020	INTEGER(iwp) :: k !:
1021	INTEGER(iwp) :: l !:
1022	INTEGER(iwp) :: m !:
1023	INTEGER(iwp) :: ys !:
1024
1025	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
1026	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
1027	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !:
1028	REAL(wp) :: work(nnxnnynnz) !:
1029
1030	#if defined( __parallel )
1031
1032	!
1033	!-- Rearrange indices of input array in order to make data to be send
1034	!-- by MPI contiguous
1035	DO i = nxl, nxr
1036	DO j = nys, nyn
1037	DO k = 1, nz
1038	f_inv(j,i,k) = f_in(k,j,i)
1039	ENDDO
1040	ENDDO
1041	ENDDO
1042
1043	!
1044	!-- Move data to different array, because memory location of work1 is
1045	!-- needed further below (work1 = work2).
1046	!-- If the PE grid is one-dimensional along x, only local reordering
1047	!-- of the data is necessary and no transposition has to be done.
1048	IF ( pdims(2) == 1 ) THEN
1049	DO k = 1, nz
1050	DO i = nxl, nxr
1051	DO j = nys, nyn
1052	f_out(j,i,k) = f_inv(j,i,k)
1053	ENDDO
1054	ENDDO
1055	ENDDO
1056	RETURN
1057	ENDIF
1058
1059	!
1060	!-- Transpose array
1061	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1062	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1063	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1064	work(1), sendrecvcount_zyd, MPI_REAL, &
1065	comm1dy, ierr )
1066	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1067
1068	!
1069	!-- Reorder transposed array
1070	m = 0
1071	DO l = 0, pdims(2) - 1
1072	ys = 0 + l * nny
1073	DO k = nzb_yd, nzt_yd
1074	DO i = nxl_yd, nxr_yd
1075	DO j = ys, ys + nny - 1
1076	m = m + 1
1077	f_out(j,i,k) = work(m)
1078	ENDDO
1079	ENDDO
1080	ENDDO
1081	ENDDO
1082
1083	#endif
1084
1085	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |