Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1216

Last change on this file since 1216 was 1216, checked in by raasch, 11 years ago
overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far, fftw implemented for 1D-decomposition resorting of arrays moved to separate routines resort_for_... bugfix in mbuild concerning Makefile_check
Property svn:keywords set to `Id`
File size: 26.8 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! re-sorting of the transposed / to be transposed arrays moved to separate
23	! routines resort_for_...
24	!
25	! Former revisions:
26	! -----------------
27	! $Id: transpose.f90 1216 2013-08-26 09:31:42Z raasch $
28	!
29	! 1111 2013-03-08 23:54:10Z raasch
30	! openACC directives added,
31	! resorting data from/to work changed, work got 4 dimensions instead of 1
32	!
33	! 1106 2013-03-04 05:31:38Z raasch
34	! preprocessor lines rearranged so that routines can also be used in serial
35	! (non-parallel) mode
36	!
37	! 1092 2013-02-02 11:24:22Z raasch
38	! unused variables removed
39	!
40	! 1036 2012-10-22 13:43:42Z raasch
41	! code put under GPL (PALM 3.9)
42	!
43	! 1003 2012-09-14 14:35:53Z raasch
44	! indices nxa, nya, etc. replaced by nx, ny, etc.
45	!
46	! 683 2011-02-09 14:25:15Z raasch
47	! openMP parallelization of transpositions for 2d-domain-decomposition
48	!
49	! 622 2010-12-10 08:08:13Z raasch
50	! optional barriers included in order to speed up collective operations
51	!
52	! 164 2008-05-15 08:46:15Z raasch
53	! f_inv changed from subroutine argument to automatic array in order to do
54	! re-ordering from f_in to f_inv in one step, one array work is needed instead
55	! of work1 and work2
56	!
57	! February 2007
58	! RCS Log replace by Id keyword, revision history cleaned up
59	!
60	! Revision 1.2 2004/04/30 13:12:17 raasch
61	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
62	! all former transpose-routine files collected in this file, enlarged
63	! transposition arrays introduced
64	!
65	! Revision 1.1 2004/04/30 13:08:16 raasch
66	! Initial revision (collection of former routines transpose_xy, transpose_xz,
67	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
68	!
69	! Revision 1.1 1997/07/24 11:25:18 raasch
70	! Initial revision
71	!
72	!------------------------------------------------------------------------------!
73	! Description:
74	! ------------
75	! Resorting data for the transposition from x to y. The transposition itself
76	! is carried out in transpose_xy
77	!------------------------------------------------------------------------------!
78
79	USE indices
80	USE transpose_indices
81
82	IMPLICIT NONE
83
84	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
85	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
86
87
88	INTEGER :: i, j, k
89
90	!
91	!-- Rearrange indices of input array in order to make data to be send
92	!-- by MPI contiguous
93	!$OMP PARALLEL PRIVATE ( i, j, k )
94	!$OMP DO
95	!$acc kernels present( f_in, f_inv )
96	!$acc loop
97	DO i = 0, nx
98	DO k = nzb_x, nzt_x
99	!$acc loop vector( 32 )
100	DO j = nys_x, nyn_x
101	f_inv(j,k,i) = f_in(i,j,k)
102	ENDDO
103	ENDDO
104	ENDDO
105	!$acc end kernels
106	!$OMP END PARALLEL
107
108	END SUBROUTINE resort_for_xy
109
110
111	SUBROUTINE transpose_xy( f_inv, f_out )
112
113	!------------------------------------------------------------------------------!
114	! Description:
115	! ------------
116	! Transposition of input array (f_in) from x to y. For the input array, all
117	! elements along x reside on the same PE, while after transposition, all
118	! elements along y reside on the same PE.
119	!------------------------------------------------------------------------------!
120
121	USE cpulog
122	USE indices
123	USE interfaces
124	USE pegrid
125	USE transpose_indices
126
127	IMPLICIT NONE
128
129	INTEGER :: i, j, k, l, ys
130
131	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
132
133	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
134
135
136	IF ( numprocs /= 1 ) THEN
137
138	#if defined( __parallel )
139	!
140	!-- Transpose array
141	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
142	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
143	!$acc update host( f_inv )
144	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
145	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
146	comm1dy, ierr )
147	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
148
149	!
150	!-- Reorder transposed array
151	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
152	!$OMP DO
153	!$acc data copyin( work )
154	DO l = 0, pdims(2) - 1
155	ys = 0 + l * ( nyn_x - nys_x + 1 )
156	!$acc kernels present( f_out, work )
157	!$acc loop
158	DO i = nxl_y, nxr_y
159	DO k = nzb_y, nzt_y
160	!$acc loop vector( 32 )
161	DO j = ys, ys + nyn_x - nys_x
162	f_out(j,i,k) = work(j-ys+1,k,i,l)
163	ENDDO
164	ENDDO
165	ENDDO
166	!$acc end kernels
167	ENDDO
168	!$acc end data
169	!$OMP END PARALLEL
170	#endif
171
172	ELSE
173
174	!
175	!-- Reorder transposed array
176	!$OMP PARALLEL PRIVATE ( i, j, k )
177	!$OMP DO
178	!$acc kernels present( f_inv, f_out )
179	!$acc loop
180	DO k = nzb_y, nzt_y
181	DO i = nxl_y, nxr_y
182	!$acc loop vector( 32 )
183	DO j = 0, ny
184	f_out(j,i,k) = f_inv(j,k,i)
185	ENDDO
186	ENDDO
187	ENDDO
188	!$acc end kernels
189	!$OMP END PARALLEL
190
191	ENDIF
192
193	END SUBROUTINE transpose_xy
194
195
196	SUBROUTINE resort_for_xz( f_inv, f_out )
197
198	!------------------------------------------------------------------------------!
199	! Description:
200	! ------------
201	! Resorting data after the transposition from x to z. The transposition itself
202	! is carried out in transpose_xz
203	!------------------------------------------------------------------------------!
204
205	USE indices
206	USE transpose_indices
207
208	IMPLICIT NONE
209
210	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
211	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214	INTEGER :: i, j, k
215
216	!
217	!-- Rearrange indices of input array in order to make data to be send
218	!-- by MPI contiguous.
219	!-- In case of parallel fft/transposition, scattered store is faster in
220	!-- backward direction!!!
221	!$OMP PARALLEL PRIVATE ( i, j, k )
222	!$OMP DO
223	!$acc kernels present( f_inv, f_out )
224	!$acc loop
225	DO k = 1, nz
226	DO i = nxl, nxr
227	!$acc loop vector( 32 )
228	DO j = nys, nyn
229	f_out(k,j,i) = f_inv(j,i,k)
230	ENDDO
231	ENDDO
232	ENDDO
233	!$acc end kernels
234	!$OMP END PARALLEL
235
236	END SUBROUTINE resort_for_xz
237
238
239	SUBROUTINE transpose_xz( f_in, f_inv )
240
241	!------------------------------------------------------------------------------!
242	! Description:
243	! ------------
244	! Transposition of input array (f_in) from x to z. For the input array, all
245	! elements along x reside on the same PE, while after transposition, all
246	! elements along z reside on the same PE.
247	!------------------------------------------------------------------------------!
248
249	USE cpulog
250	USE indices
251	USE interfaces
252	USE pegrid
253	USE transpose_indices
254
255	IMPLICIT NONE
256
257	INTEGER :: i, j, k, l, xs
258
259	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
260
261	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
262
263
264	!
265	!-- If the PE grid is one-dimensional along y, the array has only to be
266	!-- reordered locally and therefore no transposition has to be done.
267	IF ( pdims(1) /= 1 ) THEN
268
269	#if defined( __parallel )
270	!
271	!-- Reorder input array for transposition
272	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
273	!$OMP DO
274	!$acc data copyout( work )
275	DO l = 0, pdims(1) - 1
276	xs = 0 + l * nnx
277	!$acc kernels present( f_in, work )
278	!$acc loop
279	DO k = nzb_x, nzt_x
280	DO i = xs, xs + nnx - 1
281	!$acc loop vector( 32 )
282	DO j = nys_x, nyn_x
283	work(j,i-xs+1,k,l) = f_in(i,j,k)
284	ENDDO
285	ENDDO
286	ENDDO
287	!$acc end kernels
288	ENDDO
289	!$acc end data
290	!$OMP END PARALLEL
291
292	!
293	!-- Transpose array
294	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
295	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
296	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
297	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
298	comm1dx, ierr )
299	!$acc update device( f_inv )
300	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
301	#endif
302
303	ELSE
304
305	!
306	!-- Reorder the array in a way that the z index is in first position
307	!$OMP PARALLEL PRIVATE ( i, j, k )
308	!$OMP DO
309	!$acc kernels present( f_in, f_inv )
310	!$acc loop
311	DO i = nxl, nxr
312	DO j = nys, nyn
313	!$acc loop vector( 32 )
314	DO k = 1, nz
315	f_inv(j,i,k) = f_in(i,j,k)
316	ENDDO
317	ENDDO
318	ENDDO
319	!$acc end kernels
320	!$OMP END PARALLEL
321
322	ENDIF
323
324	END SUBROUTINE transpose_xz
325
326
327	SUBROUTINE resort_for_yx( f_inv, f_out )
328
329	!------------------------------------------------------------------------------!
330	! Description:
331	! ------------
332	! Resorting data after the transposition from y to x. The transposition itself
333	! is carried out in transpose_yx
334	!------------------------------------------------------------------------------!
335
336	USE indices
337	USE transpose_indices
338
339	IMPLICIT NONE
340
341	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
342	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
343
344
345	INTEGER :: i, j, k
346
347	!
348	!-- Rearrange indices of input array in order to make data to be send
349	!-- by MPI contiguous
350	!$OMP PARALLEL PRIVATE ( i, j, k )
351	!$OMP DO
352	!$acc kernels present( f_inv, f_out )
353	!$acc loop
354	DO i = 0, nx
355	DO k = nzb_x, nzt_x
356	!$acc loop vector( 32 )
357	DO j = nys_x, nyn_x
358	f_out(i,j,k) = f_inv(j,k,i)
359	ENDDO
360	ENDDO
361	ENDDO
362	!$acc end kernels
363	!$OMP END PARALLEL
364
365	END SUBROUTINE resort_for_yx
366
367
368	SUBROUTINE transpose_yx( f_in, f_inv )
369
370	!------------------------------------------------------------------------------!
371	! Description:
372	! ------------
373	! Transposition of input array (f_in) from y to x. For the input array, all
374	! elements along y reside on the same PE, while after transposition, all
375	! elements along x reside on the same PE.
376	!------------------------------------------------------------------------------!
377
378	USE cpulog
379	USE indices
380	USE interfaces
381	USE pegrid
382	USE transpose_indices
383
384	IMPLICIT NONE
385
386	INTEGER :: i, j, k, l, ys
387
388	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
389
390	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
391
392
393	IF ( numprocs /= 1 ) THEN
394
395	#if defined( __parallel )
396	!
397	!-- Reorder input array for transposition
398	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
399	!$OMP DO
400	!$acc data copyout( work )
401	DO l = 0, pdims(2) - 1
402	ys = 0 + l * ( nyn_x - nys_x + 1 )
403	!$acc kernels present( f_in, work )
404	!$acc loop
405	DO i = nxl_y, nxr_y
406	DO k = nzb_y, nzt_y
407	!$acc loop vector( 32 )
408	DO j = ys, ys + nyn_x - nys_x
409	work(j-ys+1,k,i,l) = f_in(j,i,k)
410	ENDDO
411	ENDDO
412	ENDDO
413	!$acc end kernels
414	ENDDO
415	!$acc end data
416	!$OMP END PARALLEL
417
418	!
419	!-- Transpose array
420	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
421	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
422	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
423	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
424	comm1dy, ierr )
425	!$acc update device( f_inv )
426	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
427	#endif
428
429	ELSE
430
431	!
432	!-- Reorder array f_in the same way as ALLTOALL did it
433	!$OMP PARALLEL PRIVATE ( i, j, k )
434	!$OMP DO
435	!$acc kernels present( f_in, f_inv )
436	!$acc loop
437	DO i = nxl_y, nxr_y
438	DO k = nzb_y, nzt_y
439	!$acc loop vector( 32 )
440	DO j = 0, ny
441	f_inv(j,k,i) = f_in(j,i,k)
442	ENDDO
443	ENDDO
444	ENDDO
445	!$acc end kernels
446	!$OMP END PARALLEL
447
448	ENDIF
449
450	END SUBROUTINE transpose_yx
451
452
453	SUBROUTINE transpose_yxd( f_in, f_out )
454
455	!------------------------------------------------------------------------------!
456	! Description:
457	! ------------
458	! Transposition of input array (f_in) from y to x. For the input array, all
459	! elements along y reside on the same PE, while after transposition, all
460	! elements along x reside on the same PE.
461	! This is a direct transposition for arrays with indices in regular order
462	! (k,j,i) (cf. transpose_yx).
463	!------------------------------------------------------------------------------!
464
465	USE cpulog
466	USE indices
467	USE interfaces
468	USE pegrid
469	USE transpose_indices
470
471	IMPLICIT NONE
472
473	INTEGER :: i, j, k, l, m, xs
474
475	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
476	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
477	work(nnxnnynnz)
478
479	#if defined( __parallel )
480
481	!
482	!-- Rearrange indices of input array in order to make data to be send
483	!-- by MPI contiguous
484	DO k = 1, nz
485	DO j = nys, nyn
486	DO i = nxl, nxr
487	f_inv(i,k,j) = f_in(k,j,i)
488	ENDDO
489	ENDDO
490	ENDDO
491
492	!
493	!-- Transpose array
494	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
495	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
496	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
497	work(1), sendrecvcount_xy, MPI_REAL, &
498	comm1dx, ierr )
499	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
500
501	!
502	!-- Reorder transposed array
503	m = 0
504	DO l = 0, pdims(1) - 1
505	xs = 0 + l * nnx
506	DO j = nys_x, nyn_x
507	DO k = 1, nz
508	DO i = xs, xs + nnx - 1
509	m = m + 1
510	f_out(i,j,k) = work(m)
511	ENDDO
512	ENDDO
513	ENDDO
514	ENDDO
515
516	#endif
517
518	END SUBROUTINE transpose_yxd
519
520
521	SUBROUTINE resort_for_yz( f_in, f_inv )
522
523	!------------------------------------------------------------------------------!
524	! Description:
525	! ------------
526	! Resorting data for the transposition from y to z. The transposition itself
527	! is carried out in transpose_yz
528	!------------------------------------------------------------------------------!
529
530	USE indices
531	USE transpose_indices
532
533	IMPLICIT NONE
534
535	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
536	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
537
538
539	INTEGER :: i, j, k
540
541	!
542	!-- Rearrange indices of input array in order to make data to be send
543	!-- by MPI contiguous
544	!$OMP PARALLEL PRIVATE ( i, j, k )
545	!$OMP DO
546	!$acc kernels present( f_in, f_inv )
547	!$acc loop
548	DO j = 0, ny
549	DO k = nzb_y, nzt_y
550	!$acc loop vector( 32 )
551	DO i = nxl_y, nxr_y
552	f_inv(i,k,j) = f_in(j,i,k)
553	ENDDO
554	ENDDO
555	ENDDO
556	!$acc end kernels
557	!$OMP END PARALLEL
558
559	END SUBROUTINE resort_for_yz
560
561
562	SUBROUTINE transpose_yz( f_inv, f_out )
563
564	!------------------------------------------------------------------------------!
565	! Description:
566	! ------------
567	! Transposition of input array (f_in) from y to z. For the input array, all
568	! elements along y reside on the same PE, while after transposition, all
569	! elements along z reside on the same PE.
570	!------------------------------------------------------------------------------!
571
572	USE cpulog
573	USE indices
574	USE interfaces
575	USE pegrid
576	USE transpose_indices
577
578	IMPLICIT NONE
579
580	INTEGER :: i, j, k, l, zs
581
582	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
583
584	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
585
586
587	!
588	!-- If the PE grid is one-dimensional along y, only local reordering
589	!-- of the data is necessary and no transposition has to be done.
590	IF ( pdims(1) == 1 ) THEN
591
592	!$OMP PARALLEL PRIVATE ( i, j, k )
593	!$OMP DO
594	!$acc kernels present( f_inv, f_out )
595	!$acc loop
596	DO j = 0, ny
597	DO k = nzb_y, nzt_y
598	!$acc loop vector( 32 )
599	DO i = nxl_y, nxr_y
600	f_out(i,j,k) = f_inv(i,k,j)
601	ENDDO
602	ENDDO
603	ENDDO
604	!$acc end kernels
605	!$OMP END PARALLEL
606
607	ELSE
608
609	#if defined( __parallel )
610	!
611	!-- Transpose array
612	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
613	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
614	!$acc update host( f_inv )
615	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
616	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
617	comm1dx, ierr )
618	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
619
620	!
621	!-- Reorder transposed array
622	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
623	!$OMP DO
624	!$acc data copyin( work )
625	DO l = 0, pdims(1) - 1
626	zs = 1 + l * ( nzt_y - nzb_y + 1 )
627	!$acc kernels present( f_out )
628	!$acc loop
629	DO j = nys_z, nyn_z
630	DO k = zs, zs + nzt_y - nzb_y
631	!$acc loop vector( 32 )
632	DO i = nxl_z, nxr_z
633	f_out(i,j,k) = work(i,k-zs+1,j,l)
634	ENDDO
635	ENDDO
636	ENDDO
637	!$acc end kernels
638	ENDDO
639	!$acc end data
640	!$OMP END PARALLEL
641	#endif
642
643	ENDIF
644
645	END SUBROUTINE transpose_yz
646
647
648	SUBROUTINE resort_for_zx( f_in, f_inv )
649
650	!------------------------------------------------------------------------------!
651	! Description:
652	! ------------
653	! Resorting data for the transposition from z to x. The transposition itself
654	! is carried out in transpose_zx
655	!------------------------------------------------------------------------------!
656
657	USE indices
658	USE transpose_indices
659
660	IMPLICIT NONE
661
662	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
663	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
664
665
666	INTEGER :: i, j, k
667
668	!
669	!-- Rearrange indices of input array in order to make data to be send
670	!-- by MPI contiguous
671	!$OMP PARALLEL PRIVATE ( i, j, k )
672	!$OMP DO
673	!$acc kernels present( f_in, f_inv )
674	!$acc loop
675	DO k = 1,nz
676	DO i = nxl, nxr
677	!$acc loop vector( 32 )
678	DO j = nys, nyn
679	f_inv(j,i,k) = f_in(k,j,i)
680	ENDDO
681	ENDDO
682	ENDDO
683	!$acc end kernels
684	!$OMP END PARALLEL
685
686	END SUBROUTINE resort_for_zx
687
688
689	SUBROUTINE transpose_zx( f_inv, f_out )
690
691	!------------------------------------------------------------------------------!
692	! Description:
693	! ------------
694	! Transposition of input array (f_in) from z to x. For the input array, all
695	! elements along z reside on the same PE, while after transposition, all
696	! elements along x reside on the same PE.
697	!------------------------------------------------------------------------------!
698
699	USE cpulog
700	USE indices
701	USE interfaces
702	USE pegrid
703	USE transpose_indices
704
705	IMPLICIT NONE
706
707	INTEGER :: i, j, k, l, xs
708
709	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
710
711	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
712
713
714	!
715	!-- If the PE grid is one-dimensional along y, only local reordering
716	!-- of the data is necessary and no transposition has to be done.
717	IF ( pdims(1) == 1 ) THEN
718
719	!$OMP PARALLEL PRIVATE ( i, j, k )
720	!$OMP DO
721	!$acc kernels present( f_inv, f_out )
722	!$acc loop
723	DO k = 1, nz
724	DO i = nxl, nxr
725	!$acc loop vector( 32 )
726	DO j = nys, nyn
727	f_out(i,j,k) = f_inv(j,i,k)
728	ENDDO
729	ENDDO
730	ENDDO
731	!$acc end kernels
732	!$OMP END PARALLEL
733
734	ELSE
735
736	#if defined( __parallel )
737	!
738	!-- Transpose array
739	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
740	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
741	!$acc update host( f_inv )
742	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
743	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
744	comm1dx, ierr )
745	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
746
747	!
748	!-- Reorder transposed array
749	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
750	!$OMP DO
751	!$acc data copyin( work )
752	DO l = 0, pdims(1) - 1
753	xs = 0 + l * nnx
754	!$acc kernels present( f_out )
755	!$acc loop
756	DO k = nzb_x, nzt_x
757	DO i = xs, xs + nnx - 1
758	!$acc loop vector( 32 )
759	DO j = nys_x, nyn_x
760	f_out(i,j,k) = work(j,i-xs+1,k,l)
761	ENDDO
762	ENDDO
763	ENDDO
764	!$acc end kernels
765	ENDDO
766	!$acc end data
767	!$OMP END PARALLEL
768	#endif
769
770	ENDIF
771
772	END SUBROUTINE transpose_zx
773
774
775	SUBROUTINE resort_for_zy( f_inv, f_out )
776
777	!------------------------------------------------------------------------------!
778	! Description:
779	! ------------
780	! Resorting data after the transposition from z to y. The transposition itself
781	! is carried out in transpose_zy
782	!------------------------------------------------------------------------------!
783
784	USE indices
785	USE transpose_indices
786
787	IMPLICIT NONE
788
789	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
790	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
791
792
793	INTEGER :: i, j, k
794
795	!
796	!-- Rearrange indices of input array in order to make data to be send
797	!-- by MPI contiguous
798	!$OMP PARALLEL PRIVATE ( i, j, k )
799	!$OMP DO
800	!$acc kernels present( f_inv, f_out )
801	!$acc loop
802	DO k = nzb_y, nzt_y
803	DO j = 0, ny
804	!$acc loop vector( 32 )
805	DO i = nxl_y, nxr_y
806	f_out(j,i,k) = f_inv(i,k,j)
807	ENDDO
808	ENDDO
809	ENDDO
810	!$acc end kernels
811	!$OMP END PARALLEL
812
813	END SUBROUTINE resort_for_zy
814
815
816	SUBROUTINE transpose_zy( f_in, f_inv )
817
818	!------------------------------------------------------------------------------!
819	! Description:
820	! ------------
821	! Transposition of input array (f_in) from z to y. For the input array, all
822	! elements along z reside on the same PE, while after transposition, all
823	! elements along y reside on the same PE.
824	!------------------------------------------------------------------------------!
825
826	USE cpulog
827	USE indices
828	USE interfaces
829	USE pegrid
830	USE transpose_indices
831
832	IMPLICIT NONE
833
834	INTEGER :: i, j, k, l, zs
835
836	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
837
838	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
839
840
841	!
842	!-- If the PE grid is one-dimensional along y, the array has only to be
843	!-- reordered locally and therefore no transposition has to be done.
844	IF ( pdims(1) /= 1 ) THEN
845
846	#if defined( __parallel )
847	!
848	!-- Reorder input array for transposition
849	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
850	!$OMP DO
851	!$acc data copyout( work )
852	DO l = 0, pdims(1) - 1
853	zs = 1 + l * ( nzt_y - nzb_y + 1 )
854	!$acc kernels present( f_in, work )
855	!$acc loop
856	DO j = nys_z, nyn_z
857	DO k = zs, zs + nzt_y - nzb_y
858	!$acc loop vector( 32 )
859	DO i = nxl_z, nxr_z
860	work(i,k-zs+1,j,l) = f_in(i,j,k)
861	ENDDO
862	ENDDO
863	ENDDO
864	!$acc end kernels
865	ENDDO
866	!$acc end data
867	!$OMP END PARALLEL
868
869	!
870	!-- Transpose array
871	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
872	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
873	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
874	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
875	comm1dx, ierr )
876	!$acc update device( f_inv )
877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
878	#endif
879
880	ELSE
881	!
882	!-- Reorder the array in the same way like ALLTOALL did it
883	!$OMP PARALLEL PRIVATE ( i, j, k )
884	!$OMP DO
885	!$acc kernels present( f_in, f_inv )
886	!$acc loop
887	DO k = nzb_y, nzt_y
888	DO j = 0, ny
889	!$acc loop vector( 32 )
890	DO i = nxl_y, nxr_y
891	f_inv(i,k,j) = f_in(i,j,k)
892	ENDDO
893	ENDDO
894	ENDDO
895	!$acc end kernels
896	!$OMP END PARALLEL
897
898	ENDIF
899
900	END SUBROUTINE transpose_zy
901
902
903	SUBROUTINE transpose_zyd( f_in, f_out )
904
905	!------------------------------------------------------------------------------!
906	! Description:
907	! ------------
908	! Transposition of input array (f_in) from z to y. For the input array, all
909	! elements along z reside on the same PE, while after transposition, all
910	! elements along y reside on the same PE.
911	! This is a direct transposition for arrays with indices in regular order
912	! (k,j,i) (cf. transpose_zy).
913	!------------------------------------------------------------------------------!
914
915	USE cpulog
916	USE indices
917	USE interfaces
918	USE pegrid
919	USE transpose_indices
920
921	IMPLICIT NONE
922
923	INTEGER :: i, j, k, l, m, ys
924
925	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
926	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
927	work(nnxnnynnz)
928
929	#if defined( __parallel )
930
931	!
932	!-- Rearrange indices of input array in order to make data to be send
933	!-- by MPI contiguous
934	DO i = nxl, nxr
935	DO j = nys, nyn
936	DO k = 1, nz
937	f_inv(j,i,k) = f_in(k,j,i)
938	ENDDO
939	ENDDO
940	ENDDO
941
942	!
943	!-- Move data to different array, because memory location of work1 is
944	!-- needed further below (work1 = work2).
945	!-- If the PE grid is one-dimensional along x, only local reordering
946	!-- of the data is necessary and no transposition has to be done.
947	IF ( pdims(2) == 1 ) THEN
948	DO k = 1, nz
949	DO i = nxl, nxr
950	DO j = nys, nyn
951	f_out(j,i,k) = f_inv(j,i,k)
952	ENDDO
953	ENDDO
954	ENDDO
955	RETURN
956	ENDIF
957
958	!
959	!-- Transpose array
960	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
961	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
962	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
963	work(1), sendrecvcount_zyd, MPI_REAL, &
964	comm1dy, ierr )
965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
966
967	!
968	!-- Reorder transposed array
969	m = 0
970	DO l = 0, pdims(2) - 1
971	ys = 0 + l * nny
972	DO k = nzb_yd, nzt_yd
973	DO i = nxl_yd, nxr_yd
974	DO j = ys, ys + nny - 1
975	m = m + 1
976	f_out(j,i,k) = work(m)
977	ENDDO
978	ENDDO
979	ENDDO
980	ENDDO
981
982	#endif
983
984	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |