Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1165

Last change on this file since 1165 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 23.9 KB

Line
1	SUBROUTINE transpose_xy( f_in, work, f_out )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1112 2013-03-09 00:34:37Z raasch $
27	!
28	! 1111 2013-03-08 23:54:10Z raasch
29	! openACC directives added,
30	! resorting data from/to work changed, work got 4 dimensions instead of 1
31	!
32	! 1106 2013-03-04 05:31:38Z raasch
33	! preprocessor lines rearranged so that routines can also be used in serial
34	! (non-parallel) mode
35	!
36	! 1092 2013-02-02 11:24:22Z raasch
37	! unused variables removed
38	!
39	! 1036 2012-10-22 13:43:42Z raasch
40	! code put under GPL (PALM 3.9)
41	!
42	! 1003 2012-09-14 14:35:53Z raasch
43	! indices nxa, nya, etc. replaced by nx, ny, etc.
44	!
45	! 683 2011-02-09 14:25:15Z raasch
46	! openMP parallelization of transpositions for 2d-domain-decomposition
47	!
48	! 622 2010-12-10 08:08:13Z raasch
49	! optional barriers included in order to speed up collective operations
50	!
51	! 164 2008-05-15 08:46:15Z raasch
52	! f_inv changed from subroutine argument to automatic array in order to do
53	! re-ordering from f_in to f_inv in one step, one array work is needed instead
54	! of work1 and work2
55	!
56	! February 2007
57	! RCS Log replace by Id keyword, revision history cleaned up
58	!
59	! Revision 1.2 2004/04/30 13:12:17 raasch
60	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
61	! all former transpose-routine files collected in this file, enlarged
62	! transposition arrays introduced
63	!
64	! Revision 1.1 2004/04/30 13:08:16 raasch
65	! Initial revision (collection of former routines transpose_xy, transpose_xz,
66	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
67	!
68	! Revision 1.1 1997/07/24 11:25:18 raasch
69	! Initial revision
70	!
71	!
72	! Description:
73	! ------------
74	! Transposition of input array (f_in) from x to y. For the input array, all
75	! elements along x reside on the same PE, while after transposition, all
76	! elements along y reside on the same PE.
77	!------------------------------------------------------------------------------!
78
79	USE cpulog
80	USE indices
81	USE interfaces
82	USE pegrid
83	USE transpose_indices
84
85	IMPLICIT NONE
86
87	INTEGER :: i, j, k, l, ys
88
89	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
90
91	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
92
93	!$acc declare create( f_inv )
94	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
95
96
97	!
98	!-- Rearrange indices of input array in order to make data to be send
99	!-- by MPI contiguous
100	!$OMP PARALLEL PRIVATE ( i, j, k )
101	!$OMP DO
102	!$acc kernels present( f_in )
103	!$acc loop
104	DO i = 0, nx
105	DO k = nzb_x, nzt_x
106	!$acc loop vector( 32 )
107	DO j = nys_x, nyn_x
108	f_inv(j,k,i) = f_in(i,j,k)
109	ENDDO
110	ENDDO
111	ENDDO
112	!$acc end kernels
113	!$OMP END PARALLEL
114
115	IF ( numprocs /= 1 ) THEN
116
117	#if defined( __parallel )
118	!
119	!-- Transpose array
120	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
121	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
122	!$acc update host( f_inv )
123	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
124	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
125	comm1dy, ierr )
126	!$acc update device( work )
127	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
128
129	!
130	!-- Reorder transposed array
131	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
132	!$OMP DO
133	DO l = 0, pdims(2) - 1
134	ys = 0 + l * ( nyn_x - nys_x + 1 )
135	!$acc kernels present( f_out, work )
136	!$acc loop
137	DO i = nxl_y, nxr_y
138	DO k = nzb_y, nzt_y
139	!$acc loop vector( 32 )
140	DO j = ys, ys + nyn_x - nys_x
141	f_out(j,i,k) = work(j-ys+1,k,i,l)
142	ENDDO
143	ENDDO
144	ENDDO
145	!$acc end kernels
146	ENDDO
147	!$OMP END PARALLEL
148	#endif
149
150	ELSE
151
152	!
153	!-- Reorder transposed array
154	!$OMP PARALLEL PRIVATE ( i, j, k )
155	!$OMP DO
156	!$acc kernels present( f_out )
157	!$acc loop
158	DO k = nzb_y, nzt_y
159	DO i = nxl_y, nxr_y
160	!$acc loop vector( 32 )
161	DO j = 0, ny
162	f_out(j,i,k) = f_inv(j,k,i)
163	ENDDO
164	ENDDO
165	ENDDO
166	!$acc end kernels
167	!$OMP END PARALLEL
168
169	ENDIF
170
171	END SUBROUTINE transpose_xy
172
173
174	SUBROUTINE transpose_xz( f_in, work, f_out )
175
176	!------------------------------------------------------------------------------!
177	! Description:
178	! ------------
179	! Transposition of input array (f_in) from x to z. For the input array, all
180	! elements along x reside on the same PE, while after transposition, all
181	! elements along z reside on the same PE.
182	!------------------------------------------------------------------------------!
183
184	USE cpulog
185	USE indices
186	USE interfaces
187	USE pegrid
188	USE transpose_indices
189
190	IMPLICIT NONE
191
192	INTEGER :: i, j, k, l, xs
193
194	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr)
195
196	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
197
198	!$acc declare create( f_inv )
199	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
200
201
202	!
203	!-- If the PE grid is one-dimensional along y, the array has only to be
204	!-- reordered locally and therefore no transposition has to be done.
205	IF ( pdims(1) /= 1 ) THEN
206
207	#if defined( __parallel )
208	!
209	!-- Reorder input array for transposition
210	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
211	!$OMP DO
212	DO l = 0, pdims(1) - 1
213	xs = 0 + l * nnx
214	!$acc kernels present( f_in, work )
215	!$acc loop
216	DO k = nzb_x, nzt_x
217	DO i = xs, xs + nnx - 1
218	!$acc loop vector( 32 )
219	DO j = nys_x, nyn_x
220	work(j,i-xs+1,k,l) = f_in(i,j,k)
221	ENDDO
222	ENDDO
223	ENDDO
224	!$acc end kernels
225	ENDDO
226	!$OMP END PARALLEL
227
228	!
229	!-- Transpose array
230	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
231	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
232	!$acc update host( work )
233	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
234	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
235	comm1dx, ierr )
236	!$acc update device( f_inv )
237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
238
239	!
240	!-- Reorder transposed array in a way that the z index is in first position
241	!$OMP PARALLEL PRIVATE ( i, j, k )
242	!$OMP DO
243	!$acc kernels present( f_out )
244	!$acc loop
245	DO k = 1, nz
246	DO i = nxl, nxr
247	!$acc loop vector( 32 )
248	DO j = nys, nyn
249	f_out(k,j,i) = f_inv(j,i,k)
250	ENDDO
251	ENDDO
252	ENDDO
253	!$acc end kernels
254	!$OMP END PARALLEL
255	#endif
256
257	ELSE
258
259	!
260	!-- Reorder the array in a way that the z index is in first position
261	!$OMP PARALLEL PRIVATE ( i, j, k )
262	!$OMP DO
263	!$acc kernels present( f_in )
264	!$acc loop
265	DO i = nxl, nxr
266	DO j = nys, nyn
267	!$acc loop vector( 32 )
268	DO k = 1, nz
269	f_inv(j,i,k) = f_in(i,j,k)
270	ENDDO
271	ENDDO
272	ENDDO
273	!$acc end kernels
274	!$OMP END PARALLEL
275
276	!$OMP PARALLEL PRIVATE ( i, j, k )
277	!$OMP DO
278	!$acc kernels present( f_out )
279	!$acc loop
280	DO k = 1, nz
281	DO i = nxl, nxr
282	!$acc loop vector( 32 )
283	DO j = nys, nyn
284	f_out(k,j,i) = f_inv(j,i,k)
285	ENDDO
286	ENDDO
287	ENDDO
288	!$acc end kernels
289	!$OMP END PARALLEL
290
291	ENDIF
292
293	END SUBROUTINE transpose_xz
294
295
296	SUBROUTINE transpose_yx( f_in, work, f_out )
297
298	!------------------------------------------------------------------------------!
299	! Description:
300	! ------------
301	! Transposition of input array (f_in) from y to x. For the input array, all
302	! elements along y reside on the same PE, while after transposition, all
303	! elements along x reside on the same PE.
304	!------------------------------------------------------------------------------!
305
306	USE cpulog
307	USE indices
308	USE interfaces
309	USE pegrid
310	USE transpose_indices
311
312	IMPLICIT NONE
313
314	INTEGER :: i, j, k, l, ys
315
316	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
317
318	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
319
320	!$acc declare create( f_inv )
321	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
322
323
324	IF ( numprocs /= 1 ) THEN
325
326	#if defined( __parallel )
327	!
328	!-- Reorder input array for transposition
329	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
330	!$OMP DO
331	DO l = 0, pdims(2) - 1
332	ys = 0 + l * ( nyn_x - nys_x + 1 )
333	!$acc kernels present( f_in, work )
334	!$acc loop
335	DO i = nxl_y, nxr_y
336	DO k = nzb_y, nzt_y
337	!$acc loop vector( 32 )
338	DO j = ys, ys + nyn_x - nys_x
339	work(j-ys+1,k,i,l) = f_in(j,i,k)
340	ENDDO
341	ENDDO
342	ENDDO
343	!$acc end kernels
344	ENDDO
345	!$OMP END PARALLEL
346
347	!
348	!-- Transpose array
349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
350	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
351	!$acc update host( work )
352	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
353	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
354	comm1dy, ierr )
355	!$acc update device( f_inv )
356	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
357	#endif
358
359	ELSE
360
361	!
362	!-- Reorder array f_in the same way as ALLTOALL did it
363	!$OMP PARALLEL PRIVATE ( i, j, k )
364	!$OMP DO
365	!$acc kernels present( f_in )
366	!$acc loop
367	DO i = nxl_y, nxr_y
368	DO k = nzb_y, nzt_y
369	!$acc loop vector( 32 )
370	DO j = 0, ny
371	f_inv(j,k,i) = f_in(j,i,k)
372	ENDDO
373	ENDDO
374	ENDDO
375	!$acc end kernels
376	!$OMP END PARALLEL
377
378	ENDIF
379
380	!
381	!-- Reorder transposed array in a way that the x index is in first position
382	!$OMP PARALLEL PRIVATE ( i, j, k )
383	!$OMP DO
384	!$acc kernels present( f_out )
385	!$acc loop
386	DO i = 0, nx
387	DO k = nzb_x, nzt_x
388	!$acc loop vector( 32 )
389	DO j = nys_x, nyn_x
390	f_out(i,j,k) = f_inv(j,k,i)
391	ENDDO
392	ENDDO
393	ENDDO
394	!$acc end kernels
395	!$OMP END PARALLEL
396
397	END SUBROUTINE transpose_yx
398
399
400	SUBROUTINE transpose_yxd( f_in, work, f_out )
401
402	!------------------------------------------------------------------------------!
403	! Description:
404	! ------------
405	! Transposition of input array (f_in) from y to x. For the input array, all
406	! elements along y reside on the same PE, while after transposition, all
407	! elements along x reside on the same PE.
408	! This is a direct transposition for arrays with indices in regular order
409	! (k,j,i) (cf. transpose_yx).
410	!------------------------------------------------------------------------------!
411
412	USE cpulog
413	USE indices
414	USE interfaces
415	USE pegrid
416	USE transpose_indices
417
418	IMPLICIT NONE
419
420	INTEGER :: i, j, k, l, m, xs
421
422	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
423	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
424	work(nnxnnynnz)
425
426	#if defined( __parallel )
427
428	!
429	!-- Rearrange indices of input array in order to make data to be send
430	!-- by MPI contiguous
431	DO k = 1, nz
432	DO j = nys, nyn
433	DO i = nxl, nxr
434	f_inv(i,k,j) = f_in(k,j,i)
435	ENDDO
436	ENDDO
437	ENDDO
438
439	!
440	!-- Transpose array
441	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
442	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
443	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
444	work(1), sendrecvcount_xy, MPI_REAL, &
445	comm1dx, ierr )
446	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
447
448	!
449	!-- Reorder transposed array
450	m = 0
451	DO l = 0, pdims(1) - 1
452	xs = 0 + l * nnx
453	DO j = nys_x, nyn_x
454	DO k = 1, nz
455	DO i = xs, xs + nnx - 1
456	m = m + 1
457	f_out(i,j,k) = work(m)
458	ENDDO
459	ENDDO
460	ENDDO
461	ENDDO
462
463	#endif
464
465	END SUBROUTINE transpose_yxd
466
467
468	SUBROUTINE transpose_yz( f_in, work, f_out )
469
470	!------------------------------------------------------------------------------!
471	! Description:
472	! ------------
473	! Transposition of input array (f_in) from y to z. For the input array, all
474	! elements along y reside on the same PE, while after transposition, all
475	! elements along z reside on the same PE.
476	!------------------------------------------------------------------------------!
477
478	USE cpulog
479	USE indices
480	USE interfaces
481	USE pegrid
482	USE transpose_indices
483
484	IMPLICIT NONE
485
486	INTEGER :: i, j, k, l, zs
487
488	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
489
490	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
491
492	!$acc declare create( f_inv )
493	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
494
495
496	!
497	!-- Rearrange indices of input array in order to make data to be send
498	!-- by MPI contiguous
499	!$OMP PARALLEL PRIVATE ( i, j, k )
500	!$OMP DO
501	!$acc kernels present( f_in )
502	!$acc loop
503	DO j = 0, ny
504	DO k = nzb_y, nzt_y
505	!$acc loop vector( 32 )
506	DO i = nxl_y, nxr_y
507	f_inv(i,k,j) = f_in(j,i,k)
508	ENDDO
509	ENDDO
510	ENDDO
511	!$acc end kernels
512	!$OMP END PARALLEL
513
514	!
515	!-- Move data to different array, because memory location of work1 is
516	!-- needed further below (work1 = work2).
517	!-- If the PE grid is one-dimensional along y, only local reordering
518	!-- of the data is necessary and no transposition has to be done.
519	IF ( pdims(1) == 1 ) THEN
520
521	!$OMP PARALLEL PRIVATE ( i, j, k )
522	!$OMP DO
523	!$acc kernels present( f_out )
524	!$acc loop
525	DO j = 0, ny
526	DO k = nzb_y, nzt_y
527	!$acc loop vector( 32 )
528	DO i = nxl_y, nxr_y
529	f_out(i,j,k) = f_inv(i,k,j)
530	ENDDO
531	ENDDO
532	ENDDO
533	!$acc end kernels
534	!$OMP END PARALLEL
535
536	ELSE
537
538	#if defined( __parallel )
539	!
540	!-- Transpose array
541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
543	!$acc update host( f_inv )
544	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
545	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
546	comm1dx, ierr )
547	!$acc update device( work )
548	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
549
550	!
551	!-- Reorder transposed array
552	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
553	!$OMP DO
554	DO l = 0, pdims(1) - 1
555	zs = 1 + l * ( nzt_y - nzb_y + 1 )
556	!$acc kernels present( f_out, work )
557	!$acc loop
558	DO j = nys_z, nyn_z
559	DO k = zs, zs + nzt_y - nzb_y
560	!$acc loop vector( 32 )
561	DO i = nxl_z, nxr_z
562	f_out(i,j,k) = work(i,k-zs+1,j,l)
563	ENDDO
564	ENDDO
565	ENDDO
566	!$acc end kernels
567	ENDDO
568	!$OMP END PARALLEL
569	#endif
570
571	ENDIF
572
573	END SUBROUTINE transpose_yz
574
575
576	SUBROUTINE transpose_zx( f_in, work, f_out )
577
578	!------------------------------------------------------------------------------!
579	! Description:
580	! ------------
581	! Transposition of input array (f_in) from z to x. For the input array, all
582	! elements along z reside on the same PE, while after transposition, all
583	! elements along x reside on the same PE.
584	!------------------------------------------------------------------------------!
585
586	USE cpulog
587	USE indices
588	USE interfaces
589	USE pegrid
590	USE transpose_indices
591
592	IMPLICIT NONE
593
594	INTEGER :: i, j, k, l, xs
595
596	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
597
598	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
599
600	!$acc declare create( f_inv )
601	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
602
603
604	!
605	!-- Rearrange indices of input array in order to make data to be send
606	!-- by MPI contiguous
607	!$OMP PARALLEL PRIVATE ( i, j, k )
608	!$OMP DO
609	!$acc kernels present( f_in )
610	!$acc loop
611	DO k = 1,nz
612	DO i = nxl, nxr
613	!$acc loop vector( 32 )
614	DO j = nys, nyn
615	f_inv(j,i,k) = f_in(k,j,i)
616	ENDDO
617	ENDDO
618	ENDDO
619	!$acc end kernels
620	!$OMP END PARALLEL
621
622	!
623	!-- Move data to different array, because memory location of work1 is
624	!-- needed further below (work1 = work2).
625	!-- If the PE grid is one-dimensional along y, only local reordering
626	!-- of the data is necessary and no transposition has to be done.
627	IF ( pdims(1) == 1 ) THEN
628
629	!$OMP PARALLEL PRIVATE ( i, j, k )
630	!$OMP DO
631	!$acc kernels present( f_out )
632	!$acc loop
633	DO k = 1, nz
634	DO i = nxl, nxr
635	!$acc loop vector( 32 )
636	DO j = nys, nyn
637	f_out(i,j,k) = f_inv(j,i,k)
638	ENDDO
639	ENDDO
640	ENDDO
641	!$acc end kernels
642	!$OMP END PARALLEL
643
644	ELSE
645
646	#if defined( __parallel )
647	!
648	!-- Transpose array
649	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
650	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
651	!$acc update host( f_inv )
652	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
653	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
654	comm1dx, ierr )
655	!$acc update device( work )
656	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
657
658	!
659	!-- Reorder transposed array
660	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
661	!$OMP DO
662	DO l = 0, pdims(1) - 1
663	xs = 0 + l * nnx
664	!$acc kernels present( f_out, work )
665	!$acc loop
666	DO k = nzb_x, nzt_x
667	DO i = xs, xs + nnx - 1
668	!$acc loop vector( 32 )
669	DO j = nys_x, nyn_x
670	f_out(i,j,k) = work(j,i-xs+1,k,l)
671	ENDDO
672	ENDDO
673	ENDDO
674	!$acc end kernels
675	ENDDO
676	!$OMP END PARALLEL
677	#endif
678
679	ENDIF
680
681	END SUBROUTINE transpose_zx
682
683
684	SUBROUTINE transpose_zy( f_in, work, f_out )
685
686	!------------------------------------------------------------------------------!
687	! Description:
688	! ------------
689	! Transposition of input array (f_in) from z to y. For the input array, all
690	! elements along z reside on the same PE, while after transposition, all
691	! elements along y reside on the same PE.
692	!------------------------------------------------------------------------------!
693
694	USE cpulog
695	USE indices
696	USE interfaces
697	USE pegrid
698	USE transpose_indices
699
700	IMPLICIT NONE
701
702	INTEGER :: i, j, k, l, zs
703
704	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
705
706	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
707
708	!$acc declare create( f_inv )
709	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
710
711
712	!
713	!-- If the PE grid is one-dimensional along y, the array has only to be
714	!-- reordered locally and therefore no transposition has to be done.
715	IF ( pdims(1) /= 1 ) THEN
716
717	#if defined( __parallel )
718	!
719	!-- Reorder input array for transposition
720	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
721	!$OMP DO
722	DO l = 0, pdims(1) - 1
723	zs = 1 + l * ( nzt_y - nzb_y + 1 )
724	!$acc kernels present( f_in, work )
725	!$acc loop
726	DO j = nys_z, nyn_z
727	DO k = zs, zs + nzt_y - nzb_y
728	!$acc loop vector( 32 )
729	DO i = nxl_z, nxr_z
730	work(i,k-zs+1,j,l) = f_in(i,j,k)
731	ENDDO
732	ENDDO
733	ENDDO
734	!$acc end kernels
735	ENDDO
736	!$OMP END PARALLEL
737
738	!
739	!-- Transpose array
740	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
741	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
742	!$acc update host( work )
743	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
744	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
745	comm1dx, ierr )
746	!$acc update device( f_inv )
747	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
748	#endif
749
750	ELSE
751	!
752	!-- Reorder the array in the same way like ALLTOALL did it
753	!$OMP PARALLEL PRIVATE ( i, j, k )
754	!$OMP DO
755	!$acc kernels present( f_in )
756	!$acc loop
757	DO k = nzb_y, nzt_y
758	DO j = 0, ny
759	!$acc loop vector( 32 )
760	DO i = nxl_y, nxr_y
761	f_inv(i,k,j) = f_in(i,j,k)
762	ENDDO
763	ENDDO
764	ENDDO
765	!$acc end kernels
766	!$OMP END PARALLEL
767
768	ENDIF
769
770	!
771	!-- Reorder transposed array in a way that the y index is in first position
772	!$OMP PARALLEL PRIVATE ( i, j, k )
773	!$OMP DO
774	!$acc kernels present( f_out )
775	!$acc loop
776	DO k = nzb_y, nzt_y
777	DO i = nxl_y, nxr_y
778	!$acc loop vector( 32 )
779	DO j = 0, ny
780	f_out(j,i,k) = f_inv(i,k,j)
781	ENDDO
782	ENDDO
783	ENDDO
784	!$acc end kernels
785	!$OMP END PARALLEL
786
787	END SUBROUTINE transpose_zy
788
789
790	SUBROUTINE transpose_zyd( f_in, work, f_out )
791
792	!------------------------------------------------------------------------------!
793	! Description:
794	! ------------
795	! Transposition of input array (f_in) from z to y. For the input array, all
796	! elements along z reside on the same PE, while after transposition, all
797	! elements along y reside on the same PE.
798	! This is a direct transposition for arrays with indices in regular order
799	! (k,j,i) (cf. transpose_zy).
800	!------------------------------------------------------------------------------!
801
802	USE cpulog
803	USE indices
804	USE interfaces
805	USE pegrid
806	USE transpose_indices
807
808	IMPLICIT NONE
809
810	INTEGER :: i, j, k, l, m, ys
811
812	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
813	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
814	work(nnxnnynnz)
815
816	#if defined( __parallel )
817
818	!
819	!-- Rearrange indices of input array in order to make data to be send
820	!-- by MPI contiguous
821	DO i = nxl, nxr
822	DO j = nys, nyn
823	DO k = 1, nz
824	f_inv(j,i,k) = f_in(k,j,i)
825	ENDDO
826	ENDDO
827	ENDDO
828
829	!
830	!-- Move data to different array, because memory location of work1 is
831	!-- needed further below (work1 = work2).
832	!-- If the PE grid is one-dimensional along x, only local reordering
833	!-- of the data is necessary and no transposition has to be done.
834	IF ( pdims(2) == 1 ) THEN
835	DO k = 1, nz
836	DO i = nxl, nxr
837	DO j = nys, nyn
838	f_out(j,i,k) = f_inv(j,i,k)
839	ENDDO
840	ENDDO
841	ENDDO
842	RETURN
843	ENDIF
844
845	!
846	!-- Transpose array
847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
849	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
850	work(1), sendrecvcount_zyd, MPI_REAL, &
851	comm1dy, ierr )
852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
853
854	!
855	!-- Reorder transposed array
856	m = 0
857	DO l = 0, pdims(2) - 1
858	ys = 0 + l * nny
859	DO k = nzb_yd, nzt_yd
860	DO i = nxl_yd, nxr_yd
861	DO j = ys, ys + nny - 1
862	m = m + 1
863	f_out(j,i,k) = work(m)
864	ENDDO
865	ENDDO
866	ENDDO
867	ENDDO
868
869	#endif
870
871	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |