source: palm/trunk/SOURCE/transpose.f90 @ 1318

Last change on this file since 1318 was 1318, checked in by raasch, 10 years ago

former files/routines cpu_log and cpu_statistics combined to one module,
which also includes the former data module cpulog from the modules-file,
module interfaces removed

1 SUBROUTINE resort_for_xy( f_in, f_inv )
2
3!--------------------------------------------------------------------------------!
4! This file is part of PALM.
5!
6! PALM is free software: you can redistribute it and/or modify it under the terms
7! of the GNU General Public License as published by the Free Software Foundation,
8! either version 3 of the License, or (at your option) any later version.
9!
10! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
13!
14! You should have received a copy of the GNU General Public License along with
15! PALM. If not, see <http://www.gnu.org/licenses/>.
16!
17! Copyright 1997-2014 Leibniz Universitaet Hannover
18!--------------------------------------------------------------------------------!
19!
20! Current revisions:
21! -----------------
22! cpu_log_nowait parameter added to cpu measurements of the transpositions
23! required for solving the Poisson equation (poisfft),
24! module interfaces removed
25!
26! Former revisions:
27! -----------------
28! $Id: transpose.f90 1318 2014-03-17 13:35:16Z raasch $
29!
30! 1257 2013-11-08 15:18:40Z raasch
31! openacc loop and loop vector clauses removed
32!
33! 1216 2013-08-26 09:31:42Z raasch
34! re-sorting of the transposed / to be transposed arrays moved to separate
35! routines resort_for_...
36!
37! 1111 2013-03-08 23:54:10Z raasch
38! openACC directives added,
39! resorting data from/to work changed, work got 4 dimensions instead of 1
40!
41! 1106 2013-03-04 05:31:38Z raasch
42! preprocessor lines rearranged so that routines can also be used in serial
43! (non-parallel) mode
44!
45! 1092 2013-02-02 11:24:22Z raasch
46! unused variables removed
47!
48! 1036 2012-10-22 13:43:42Z raasch
49! code put under GPL (PALM 3.9)
50!
51! 1003 2012-09-14 14:35:53Z raasch
52! indices nxa, nya, etc. replaced by nx, ny, etc.
53!
54! 683 2011-02-09 14:25:15Z raasch
55! openMP parallelization of transpositions for 2d-domain-decomposition
56!
57! 622 2010-12-10 08:08:13Z raasch
58! optional barriers included in order to speed up collective operations
59!
60! 164 2008-05-15 08:46:15Z raasch
61! f_inv changed from subroutine argument to automatic array in order to do
62! re-ordering from f_in to f_inv in one step, one array work is needed instead
63! of work1 and work2
64!
65! February 2007
66! RCS Log replaced by Id keyword, revision history cleaned up
67!
68! Revision 1.2  2004/04/30 13:12:17  raasch
69! Switched from mpi_alltoallv to the simpler mpi_alltoall,
70! all former transpose-routine files collected in this file, enlarged
71! transposition arrays introduced
72!
73! Revision 1.1  2004/04/30 13:08:16  raasch
74! Initial revision (collection of former routines transpose_xy, transpose_xz,
75!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
76!
77! Revision 1.1  1997/07/24 11:25:18  raasch
78! Initial revision
79!
80!------------------------------------------------------------------------------!
81! Description:
82! ------------
83! Resorting data for the transposition from x to y. The transposition itself
84! is carried out in transpose_xy
85!------------------------------------------------------------------------------!
86
87     USE indices
88     USE transpose_indices
89
90     IMPLICIT NONE
91
92     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
93     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
94
95
96     INTEGER ::  i, j, k
97
98!
99!-- Rearrange indices of input array in order to make data to be sent
100!-- by MPI contiguous
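!-- (The loops below store f_inv(j,k,i) = f_in(i,j,k), i.e. the x index
!-- becomes the slowest-varying dimension; transpose_xy then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dy.)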
101    !$OMP  PARALLEL PRIVATE ( i, j, k )
102    !$OMP  DO
103    !$acc kernels present( f_in, f_inv )
104     DO  i = 0, nx
105         DO  k = nzb_x, nzt_x
106             DO  j = nys_x, nyn_x
107                 f_inv(j,k,i) = f_in(i,j,k)
108             ENDDO
109         ENDDO
110     ENDDO
111     !$acc end kernels
112     !$OMP  END PARALLEL
113
114 END SUBROUTINE resort_for_xy
115
116
117 SUBROUTINE transpose_xy( f_inv, f_out )
118
119!------------------------------------------------------------------------------!
120! Description:
121! ------------
122! Transposition of input array (f_inv) from x to y. For the input array, all
123! elements along x reside on the same PE, while after transposition, all
124! elements along y reside on the same PE.
125!------------------------------------------------------------------------------!
126
127    USE cpulog
128    USE indices
129    USE pegrid
130    USE transpose_indices
131
132    IMPLICIT NONE
133
134    INTEGER ::  i, j, k, l, ys
135   
136    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
137
138    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
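!
!-- work receives the data from MPI_ALLTOALL as one block per PE of the
!-- 1d communicator comm1dy; the last dimension counts these blocks.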
139
140
141    IF ( numprocs /= 1 )  THEN
142
143#if defined( __parallel )
144!
145!--    Transpose array
146       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
147       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
148       !$acc update host( f_inv )
149       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
150                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
151                          comm1dy, ierr )
152       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
153
154!
155!--    Reorder transposed array
156!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
157!$OMP  DO
158       !$acc data copyin( work )
159       DO  l = 0, pdims(2) - 1
160          ys = 0 + l * ( nyn_x - nys_x + 1 )
161          !$acc kernels present( f_out, work )
162          DO  i = nxl_y, nxr_y
163             DO  k = nzb_y, nzt_y
164                DO  j = ys, ys + nyn_x - nys_x
165                   f_out(j,i,k) = work(j-ys+1,k,i,l)
166                ENDDO
167             ENDDO
168          ENDDO
169          !$acc end kernels
170       ENDDO
171       !$acc end data
172!$OMP  END PARALLEL
173#endif
174
175    ELSE
176
177!
178!--    Reorder transposed array
179!$OMP  PARALLEL PRIVATE ( i, j, k )
180!$OMP  DO
181       !$acc kernels present( f_inv, f_out )
182       DO  k = nzb_y, nzt_y
183          DO  i = nxl_y, nxr_y
184             DO  j = 0, ny
185                f_out(j,i,k) = f_inv(j,k,i)
186             ENDDO
187          ENDDO
188       ENDDO
189       !$acc end kernels
190!$OMP  END PARALLEL
191
192    ENDIF
193
194 END SUBROUTINE transpose_xy
195
196
197 SUBROUTINE resort_for_xz( f_inv, f_out )
198
199!------------------------------------------------------------------------------!
200! Description:
201! ------------
202! Resorting data after the transposition from x to z. The transposition itself
203! is carried out in transpose_xz
204!------------------------------------------------------------------------------!
205
206     USE indices
207     USE transpose_indices
208
209     IMPLICIT NONE
210
211     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
212     REAL ::  f_out(1:nz,nys:nyn,nxl:nxr)
213
214
215     INTEGER ::  i, j, k
216
217!
218!-- Rearrange indices of the array transposed in transpose_xz back into
219!-- the regular (k,j,i) order.
220!-- In case of parallel fft/transposition, scattered store is faster in
221!-- backward direction!!!
222    !$OMP  PARALLEL PRIVATE ( i, j, k )
223    !$OMP  DO
224    !$acc kernels present( f_inv, f_out )
225     DO  k = 1, nz
226         DO  i = nxl, nxr
227             DO  j = nys, nyn
228                 f_out(k,j,i) = f_inv(j,i,k)
229             ENDDO
230         ENDDO
231     ENDDO
232     !$acc end kernels
233     !$OMP  END PARALLEL
234
235 END SUBROUTINE resort_for_xz
236
237
238 SUBROUTINE transpose_xz( f_in, f_inv )
239
240!------------------------------------------------------------------------------!
241! Description:
242! ------------
243! Transposition of input array (f_in) from x to z. For the input array, all
244! elements along x reside on the same PE, while after transposition, all
245! elements along z reside on the same PE.
246!------------------------------------------------------------------------------!
247
248    USE cpulog
249    USE indices
250    USE pegrid
251    USE transpose_indices
252
253    IMPLICIT NONE
254
255    INTEGER ::  i, j, k, l, xs
256   
257    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
258
259    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
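!
!-- work collects the data to be sent as one block per PE of the 1d
!-- communicator comm1dx; block l holds the x indices xs = l*nnx ... xs+nnx-1.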
260
261
262!
263!-- If the PE grid is one-dimensional along y, the array has only to be
264!-- reordered locally and therefore no transposition has to be done.
265    IF ( pdims(1) /= 1 )  THEN
266
267#if defined( __parallel )
268!
269!--    Reorder input array for transposition
270!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
271!$OMP  DO
272       !$acc data copyout( work )
273       DO  l = 0, pdims(1) - 1
274          xs = 0 + l * nnx
275          !$acc kernels present( f_in, work )
276          DO  k = nzb_x, nzt_x
277             DO  i = xs, xs + nnx - 1
278                DO  j = nys_x, nyn_x
279                   work(j,i-xs+1,k,l) = f_in(i,j,k)
280                ENDDO
281             ENDDO
282          ENDDO
283          !$acc end kernels
284       ENDDO
285       !$acc end data
286!$OMP  END PARALLEL
287
288!
289!--    Transpose array
290       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
291       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
292       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
293                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
294                          comm1dx, ierr )
295       !$acc update device( f_inv )
296       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
297#endif
298
299    ELSE
300
301!
302!--    Reorder the array so that the z index is in the first position
303!$OMP  PARALLEL PRIVATE ( i, j, k )
304!$OMP  DO
305       !$acc kernels present( f_in, f_inv )
306       DO  i = nxl, nxr
307          DO  j = nys, nyn
308             DO  k = 1, nz
309                f_inv(j,i,k) = f_in(i,j,k)
310             ENDDO
311          ENDDO
312       ENDDO
313       !$acc end kernels
314!$OMP  END PARALLEL
315
316    ENDIF
317
318 END SUBROUTINE transpose_xz
319
320
321 SUBROUTINE resort_for_yx( f_inv, f_out )
322
323!------------------------------------------------------------------------------!
324! Description:
325! ------------
326! Resorting data after the transposition from y to x. The transposition itself
327! is carried out in transpose_yx
328!------------------------------------------------------------------------------!
329
330     USE indices
331     USE transpose_indices
332
333     IMPLICIT NONE
334
335     REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
336     REAL ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
337
338
339     INTEGER ::  i, j, k
340
341!
342!-- Rearrange indices of the array transposed in transpose_yx, making x
343!-- the fastest-varying dimension again
344    !$OMP  PARALLEL PRIVATE ( i, j, k )
345    !$OMP  DO
346    !$acc kernels present( f_inv, f_out )
347     DO  i = 0, nx
348         DO  k = nzb_x, nzt_x
349             DO  j = nys_x, nyn_x
350                 f_out(i,j,k) = f_inv(j,k,i)
351             ENDDO
352         ENDDO
353     ENDDO
354     !$acc end kernels
355     !$OMP  END PARALLEL
356
357 END SUBROUTINE resort_for_yx
358
359
360 SUBROUTINE transpose_yx( f_in, f_inv )
361
362!------------------------------------------------------------------------------!
363! Description:
364! ------------
365! Transposition of input array (f_in) from y to x. For the input array, all
366! elements along y reside on the same PE, while after transposition, all
367! elements along x reside on the same PE.
368!------------------------------------------------------------------------------!
369
370    USE cpulog
371    USE indices
372    USE pegrid
373    USE transpose_indices
374
375    IMPLICIT NONE
376
377    INTEGER ::  i, j, k, l, ys
378   
379    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
380
381    REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
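!
!-- work collects the data to be sent as one block per PE of comm1dy; block l
!-- holds the y indices ys = l*(nyn_x-nys_x+1) ... ys+nyn_x-nys_x of f_in.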
382
383
384    IF ( numprocs /= 1 )  THEN
385
386#if defined( __parallel )
387!
388!--    Reorder input array for transposition
389!$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
390!$OMP  DO
391       !$acc data copyout( work )
392       DO  l = 0, pdims(2) - 1
393          ys = 0 + l * ( nyn_x - nys_x + 1 )
394          !$acc kernels present( f_in, work )
395          DO  i = nxl_y, nxr_y
396             DO  k = nzb_y, nzt_y
397                DO  j = ys, ys + nyn_x - nys_x
398                   work(j-ys+1,k,i,l) = f_in(j,i,k)
399                ENDDO
400             ENDDO
401          ENDDO
402          !$acc end kernels
403       ENDDO
404       !$acc end data
405!$OMP  END PARALLEL
406
407!
408!--    Transpose array
409       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
410       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
411       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
412                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
413                          comm1dy, ierr )
414       !$acc update device( f_inv )
415       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
416#endif
417
418    ELSE
419
420!
421!--    Reorder array f_in in the same way as ALLTOALL would have done
422!$OMP  PARALLEL PRIVATE ( i, j, k )
423!$OMP  DO
424       !$acc kernels present( f_in, f_inv )
425       DO  i = nxl_y, nxr_y
426          DO  k = nzb_y, nzt_y
427             DO  j = 0, ny
428                f_inv(j,k,i) = f_in(j,i,k)
429             ENDDO
430          ENDDO
431       ENDDO
432       !$acc end kernels
433!$OMP  END PARALLEL
434
435    ENDIF
436
437 END SUBROUTINE transpose_yx
438
439
440 SUBROUTINE transpose_yxd( f_in, f_out )
441
442!------------------------------------------------------------------------------!
443! Description:
444! ------------
445! Transposition of input array (f_in) from y to x. For the input array, all
446! elements along y reside on the same PE, while after transposition, all
447! elements along x reside on the same PE.
448! This is a direct transposition for arrays with indices in regular order
449! (k,j,i) (cf. transpose_yx).
450!------------------------------------------------------------------------------!
451
452    USE cpulog
453    USE indices
454    USE pegrid
455    USE transpose_indices
456
457    IMPLICIT NONE
458
459    INTEGER ::  i, j, k, l, m, xs
460
461    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
462             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
463             work(nnx*nny*nnz)
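!
!-- In this direct version work is a 1d buffer; the blocks received by
!-- MPI_ALLTOALL are unpacked element by element via the running index m.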
464
465#if defined( __parallel )
466
467!
468!-- Rearrange indices of input array in order to make data to be sent
469!-- by MPI contiguous
470    DO  k = 1, nz
471       DO  j = nys, nyn
472          DO  i = nxl, nxr
473             f_inv(i,k,j) = f_in(k,j,i)
474          ENDDO
475       ENDDO
476    ENDDO
477
478!
479!-- Transpose array
480    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
481    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
482    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
483                       work(1),          sendrecvcount_xy, MPI_REAL, &
484                       comm1dx, ierr )
485    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
486
487!
488!-- Reorder transposed array
489    m = 0
490    DO  l = 0, pdims(1) - 1
491       xs = 0 + l * nnx
492       DO  j = nys_x, nyn_x
493          DO  k = 1, nz
494             DO  i = xs, xs + nnx - 1
495                m = m + 1
496                f_out(i,j,k) = work(m)
497             ENDDO
498          ENDDO
499       ENDDO
500    ENDDO
501
502#endif
503
504 END SUBROUTINE transpose_yxd
505
506
507 SUBROUTINE resort_for_yz( f_in, f_inv )
508
509!------------------------------------------------------------------------------!
510! Description:
511! ------------
512! Resorting data for the transposition from y to z. The transposition itself
513! is carried out in transpose_yz
514!------------------------------------------------------------------------------!
515
516     USE indices
517     USE transpose_indices
518
519     IMPLICIT NONE
520
521     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
522     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
523
524
525     INTEGER ::  i, j, k
526
527!
528!-- Rearrange indices of input array in order to make data to be sent
529!-- by MPI contiguous
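!-- (The loops below store f_inv(i,k,j) = f_in(j,i,k), i.e. the y index
!-- becomes the slowest-varying dimension; transpose_yz then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dx.)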
530    !$OMP  PARALLEL PRIVATE ( i, j, k )
531    !$OMP  DO
532    !$acc kernels present( f_in, f_inv )
533     DO  j = 0, ny
534         DO  k = nzb_y, nzt_y
535             DO  i = nxl_y, nxr_y
536                 f_inv(i,k,j) = f_in(j,i,k)
537             ENDDO
538         ENDDO
539     ENDDO
540     !$acc end kernels
541     !$OMP  END PARALLEL
542
543 END SUBROUTINE resort_for_yz
544
545
546 SUBROUTINE transpose_yz( f_inv, f_out )
547
548!------------------------------------------------------------------------------!
549! Description:
550! ------------
551! Transposition of input array (f_inv) from y to z. For the input array, all
552! elements along y reside on the same PE, while after transposition, all
553! elements along z reside on the same PE.
554!------------------------------------------------------------------------------!
555
556    USE cpulog
557    USE indices
558    USE pegrid
559    USE transpose_indices
560
561    IMPLICIT NONE
562
563    INTEGER ::  i, j, k, l, zs
564   
565    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
566
567    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
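!
!-- work receives the data from MPI_ALLTOALL as one block per PE of comm1dx;
!-- block l contains the z indices zs = 1+l*(nzt_y-nzb_y+1) ... zs+nzt_y-nzb_y
!-- of f_out.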
568
569
570!
571!-- If the PE grid is one-dimensional along y, only local reordering
572!-- of the data is necessary and no transposition has to be done.
573    IF ( pdims(1) == 1 )  THEN
574
575!$OMP  PARALLEL PRIVATE ( i, j, k )
576!$OMP  DO
577       !$acc kernels present( f_inv, f_out )
578       DO  j = 0, ny
579          DO  k = nzb_y, nzt_y
580             DO  i = nxl_y, nxr_y
581                f_out(i,j,k) = f_inv(i,k,j)
582             ENDDO
583          ENDDO
584       ENDDO
585       !$acc end kernels
586!$OMP  END PARALLEL
587
588    ELSE
589
590#if defined( __parallel )
591!
592!--    Transpose array
593       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
594       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
595       !$acc update host( f_inv )
596       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
597                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
598                          comm1dx, ierr )
599       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
600
601!
602!--    Reorder transposed array
603!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
604!$OMP  DO
605       !$acc data copyin( work )
606       DO  l = 0, pdims(1) - 1
607          zs = 1 + l * ( nzt_y - nzb_y + 1 )
608          !$acc kernels present( f_out )
609          DO  j = nys_z, nyn_z
610             DO  k = zs, zs + nzt_y - nzb_y
611                DO  i = nxl_z, nxr_z
612                   f_out(i,j,k) = work(i,k-zs+1,j,l)
613                ENDDO
614             ENDDO
615          ENDDO
616          !$acc end kernels
617       ENDDO
618       !$acc end data
619!$OMP  END PARALLEL
620#endif
621
622   ENDIF
623
624 END SUBROUTINE transpose_yz
625
626
627 SUBROUTINE resort_for_zx( f_in, f_inv )
628
629!------------------------------------------------------------------------------!
630! Description:
631! ------------
632! Resorting data for the transposition from z to x. The transposition itself
633! is carried out in transpose_zx
634!------------------------------------------------------------------------------!
635
636     USE indices
637     USE transpose_indices
638
639     IMPLICIT NONE
640
641     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr)
642     REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
643
644
645     INTEGER ::  i, j, k
646
647!
648!-- Rearrange indices of input array in order to make data to be sent
649!-- by MPI contiguous
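!-- (The loops below store f_inv(j,i,k) = f_in(k,j,i), i.e. the z index
!-- becomes the slowest-varying dimension; transpose_zx then passes f_inv
!-- directly to MPI_ALLTOALL on communicator comm1dx.)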
650    !$OMP  PARALLEL PRIVATE ( i, j, k )
651    !$OMP  DO
652    !$acc kernels present( f_in, f_inv )
653     DO  k = 1,nz
654         DO  i = nxl, nxr
655             DO  j = nys, nyn
656                 f_inv(j,i,k) = f_in(k,j,i)
657             ENDDO
658         ENDDO
659     ENDDO
660     !$acc end kernels
661     !$OMP  END PARALLEL
662
663 END SUBROUTINE resort_for_zx
664
665
666 SUBROUTINE transpose_zx( f_inv, f_out )
667
668!------------------------------------------------------------------------------!
669! Description:
670! ------------
671! Transposition of input array (f_inv) from z to x. For the input array, all
672! elements along z reside on the same PE, while after transposition, all
673! elements along x reside on the same PE.
674!------------------------------------------------------------------------------!
675
676    USE cpulog
677    USE indices
678    USE pegrid
679    USE transpose_indices
680
681    IMPLICIT NONE
682
683    INTEGER ::  i, j, k, l, xs
684   
685    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
686
687    REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
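!
!-- work receives the data from MPI_ALLTOALL as one block per PE of comm1dx;
!-- block l contains the x indices xs = l*nnx ... xs+nnx-1 of f_out.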
688
689
690!
691!-- If the PE grid is one-dimensional along y, only local reordering
692!-- of the data is necessary and no transposition has to be done.
693    IF ( pdims(1) == 1 )  THEN
694
695!$OMP  PARALLEL PRIVATE ( i, j, k )
696!$OMP  DO
697       !$acc kernels present( f_inv, f_out )
698       DO  k = 1, nz
699          DO  i = nxl, nxr
700             DO  j = nys, nyn
701                f_out(i,j,k) = f_inv(j,i,k)
702             ENDDO
703          ENDDO
704       ENDDO
705       !$acc end kernels
706!$OMP  END PARALLEL
707
708    ELSE
709
710#if defined( __parallel )
711!
712!--    Transpose array
713       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
714       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
715       !$acc update host( f_inv )
716       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
717                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
718                          comm1dx, ierr )
719       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
720
721!
722!--    Reorder transposed array
723!$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
724!$OMP  DO
725       !$acc data copyin( work )
726       DO  l = 0, pdims(1) - 1
727          xs = 0 + l * nnx
728          !$acc kernels present( f_out )
729          DO  k = nzb_x, nzt_x
730             DO  i = xs, xs + nnx - 1
731                DO  j = nys_x, nyn_x
732                   f_out(i,j,k) = work(j,i-xs+1,k,l)
733                ENDDO
734             ENDDO
735          ENDDO
736          !$acc end kernels
737       ENDDO
738       !$acc end data
739!$OMP  END PARALLEL
740#endif
741
742    ENDIF
743
744 END SUBROUTINE transpose_zx
745
746
747 SUBROUTINE resort_for_zy( f_inv, f_out )
748
749!------------------------------------------------------------------------------!
750! Description:
751! ------------
752! Resorting data after the transposition from z to y. The transposition itself
753! is carried out in transpose_zy
754!------------------------------------------------------------------------------!
755
756     USE indices
757     USE transpose_indices
758
759     IMPLICIT NONE
760
761     REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
762     REAL ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
763
764
765     INTEGER ::  i, j, k
766
767!
768!-- Rearrange indices of the array transposed in transpose_zy, making y
769!-- the fastest-varying dimension again
770    !$OMP  PARALLEL PRIVATE ( i, j, k )
771    !$OMP  DO
772    !$acc kernels present( f_inv, f_out )
773     DO  k = nzb_y, nzt_y
774         DO  j = 0, ny
775             DO  i = nxl_y, nxr_y
776                 f_out(j,i,k) = f_inv(i,k,j)
777             ENDDO
778         ENDDO
779     ENDDO
780     !$acc end kernels
781     !$OMP  END PARALLEL
782
783 END SUBROUTINE resort_for_zy
784
785
786 SUBROUTINE transpose_zy( f_in, f_inv )
787
788!------------------------------------------------------------------------------!
789! Description:
790! ------------
791! Transposition of input array (f_in) from z to y. For the input array, all
792! elements along z reside on the same PE, while after transposition, all
793! elements along y reside on the same PE.
794!------------------------------------------------------------------------------!
795
796    USE cpulog
797    USE indices
798    USE pegrid
799    USE transpose_indices
800
801    IMPLICIT NONE
802
803    INTEGER ::  i, j, k, l, zs
804   
805    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
806
807    REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
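!
!-- work collects the data to be sent as one block per PE of comm1dx; block l
!-- holds the z indices zs = 1+l*(nzt_y-nzb_y+1) ... zs+nzt_y-nzb_y of f_in.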
808
809
810!
811!-- If the PE grid is one-dimensional along y, the array has only to be
812!-- reordered locally and therefore no transposition has to be done.
813    IF ( pdims(1) /= 1 )  THEN
814
815#if defined( __parallel )
816!
817!--    Reorder input array for transposition
818!$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
819!$OMP  DO
820       !$acc data copyout( work )
821       DO  l = 0, pdims(1) - 1
822          zs = 1 + l * ( nzt_y - nzb_y + 1 )
823          !$acc kernels present( f_in, work )
824          DO  j = nys_z, nyn_z
825             DO  k = zs, zs + nzt_y - nzb_y
826                DO  i = nxl_z, nxr_z
827                   work(i,k-zs+1,j,l) = f_in(i,j,k)
828                ENDDO
829             ENDDO
830          ENDDO
831          !$acc end kernels
832       ENDDO
833       !$acc end data
834!$OMP  END PARALLEL
835
836!
837!--    Transpose array
838       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
839       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
840       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
841                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
842                          comm1dx, ierr )
843       !$acc update device( f_inv )
844       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
845#endif
846
847    ELSE
848!
849!--    Reorder the array in the same way as ALLTOALL would have done
850!$OMP  PARALLEL PRIVATE ( i, j, k )
851!$OMP  DO
852       !$acc kernels present( f_in, f_inv )
853       DO  k = nzb_y, nzt_y
854          DO  j = 0, ny
855             DO  i = nxl_y, nxr_y
856                f_inv(i,k,j) = f_in(i,j,k)
857             ENDDO
858          ENDDO
859       ENDDO
860       !$acc end kernels
861!$OMP  END PARALLEL
862
863    ENDIF
864
865 END SUBROUTINE transpose_zy
866
867
868 SUBROUTINE transpose_zyd( f_in, f_out )
869
870!------------------------------------------------------------------------------!
871! Description:
872! ------------
873! Transposition of input array (f_in) from z to y. For the input array, all
874! elements along z reside on the same PE, while after transposition, all
875! elements along y reside on the same PE.
876! This is a direct transposition for arrays with indices in regular order
877! (k,j,i) (cf. transpose_zy).
878!------------------------------------------------------------------------------!
879
880    USE cpulog
881    USE indices
882    USE pegrid
883    USE transpose_indices
884
885    IMPLICIT NONE
886
887    INTEGER ::  i, j, k, l, m, ys
888   
889    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
890             f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd),                 &
891             work(nnx*nny*nnz)
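!
!-- In this direct version work is a 1d buffer; the blocks received by
!-- MPI_ALLTOALL are unpacked element by element via the running index m.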
892
893#if defined( __parallel )
894
895!
896!-- Rearrange indices of input array in order to make data to be sent
897!-- by MPI contiguous
898    DO  i = nxl, nxr
899       DO  j = nys, nyn
900          DO  k = 1, nz
901             f_inv(j,i,k) = f_in(k,j,i)
902          ENDDO
903       ENDDO
904    ENDDO
905
906!
907!-- If the PE grid is one-dimensional along x, only local reordering
908!-- of the data (copy from f_inv to f_out) is necessary, no transposition
909!-- has to be done, and the routine returns immediately after the copy
910!-- loop below.
911    IF ( pdims(2) == 1 )  THEN
912       DO  k = 1, nz
913          DO  i = nxl, nxr
914             DO  j = nys, nyn
915                f_out(j,i,k) = f_inv(j,i,k)
916             ENDDO
917          ENDDO
918       ENDDO
919       RETURN
920    ENDIF
921
922!
923!-- Transpose array
924    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
925    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
926    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
927                       work(1),          sendrecvcount_zyd, MPI_REAL, &
928                       comm1dy, ierr )
929    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
930
931!
932!-- Reorder transposed array
933    m = 0
934    DO  l = 0, pdims(2) - 1
935       ys = 0 + l * nny
936       DO  k = nzb_yd, nzt_yd
937          DO  i = nxl_yd, nxr_yd
938             DO  j = ys, ys + nny - 1
939                m = m + 1
940                f_out(j,i,k) = work(m)
941             ENDDO
942          ENDDO
943       ENDDO
944    ENDDO
945
946#endif
947
948 END SUBROUTINE transpose_zyd