Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 2113

Last change on this file since 2113 was 2101, checked in by suehring, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.2 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2017 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 2101 2017-01-05 16:42:31Z kanani $
27	!
28	! 2000 2016-08-20 18:09:15Z knoop
29	! Forced header and separation lines into 80 columns
30	!
31	! 1682 2015-10-07 23:56:08Z knoop
32	! Code annotations made doxygen readable
33	!
34	! 1324 2014-03-21 09:13:16Z suehring
35	! Bugfix: ONLY statement for module pegrid removed
36	!
37	! 1320 2014-03-20 08:40:49Z raasch
38	! ONLY-attribute added to USE-statements,
39	! kind-parameters added to all INTEGER and REAL declaration statements,
40	! kinds are defined in new module kinds,
41	! old module precision_kind is removed,
42	! revision history before 2012 removed,
43	! comment fields (!:) to be used for variable explanations added to
44	! all variable declaration statements
45	!
46	! 1318 2014-03-17 13:35:16Z raasch
47	! cpu_log_nowait parameter added to cpu measurements of the transpositions
48	! required for solving the Poisson equation (poisfft),
49	! module interfaces removed
50	!
51	! 1257 2013-11-08 15:18:40Z raasch
52	! openacc loop and loop vector clauses removed
53	!
54	! 1216 2013-08-26 09:31:42Z raasch
55	! re-sorting of the transposed / to be transposed arrays moved to separate
56	! routines resort_for_...
57	!
58	! 1111 2013-03-08 23:54:10Z raasch
59	! openACC directives added,
60	! resorting data from/to work changed, work got 4 dimensions instead of 1
61	!
62	! 1106 2013-03-04 05:31:38Z raasch
63	! preprocessor lines rearranged so that routines can also be used in serial
64	! (non-parallel) mode
65	!
66	! 1092 2013-02-02 11:24:22Z raasch
67	! unused variables removed
68	!
69	! 1036 2012-10-22 13:43:42Z raasch
70	! code put under GPL (PALM 3.9)
71	!
72	! 1003 2012-09-14 14:35:53Z raasch
73	! indices nxa, nya, etc. replaced by nx, ny, etc.
74	!
75	! Revision 1.1 1997/07/24 11:25:18 raasch
76	! Initial revision
77	!
78	!------------------------------------------------------------------------------!
79	! Description:
80	! ------------
81	!> Resorting data for the transposition from x to y. The transposition itself
82	!> is carried out in transpose_xy
83	!------------------------------------------------------------------------------!
84	SUBROUTINE resort_for_xy( f_in, f_inv )
85
86
87	USE indices, &
88	ONLY: nx
89
90	USE kinds
91
92	USE transpose_indices, &
93	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
94
95	IMPLICIT NONE
96
97	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
98	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
99
100
101	INTEGER(iwp) :: i !<
102	INTEGER(iwp) :: j !<
103	INTEGER(iwp) :: k !<
104	!
105	!-- Rearrange indices of input array in order to make data to be send
106	!-- by MPI contiguous
107	!$OMP PARALLEL PRIVATE ( i, j, k )
108	!$OMP DO
109	!$acc kernels present( f_in, f_inv )
110	DO i = 0, nx
111	DO k = nzb_x, nzt_x
112	DO j = nys_x, nyn_x
113	f_inv(j,k,i) = f_in(i,j,k)
114	ENDDO
115	ENDDO
116	ENDDO
117	!$acc end kernels
118	!$OMP END PARALLEL
119
120	END SUBROUTINE resort_for_xy
121
122
123	!------------------------------------------------------------------------------!
124	! Description:
125	! ------------
126	!> Transposition of input array (f_in) from x to y. For the input array, all
127	!> elements along x reside on the same PE, while after transposition, all
128	!> elements along y reside on the same PE.
129	!------------------------------------------------------------------------------!
130	SUBROUTINE transpose_xy( f_inv, f_out )
131
132
133	USE cpulog, &
134	ONLY: cpu_log, cpu_log_nowait, log_point_s
135
136	USE indices, &
137	ONLY: nx, ny
138
139	USE kinds
140
141	USE pegrid
142
143	USE transpose_indices, &
144	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
145
146	IMPLICIT NONE
147
148	INTEGER(iwp) :: i !<
149	INTEGER(iwp) :: j !<
150	INTEGER(iwp) :: k !<
151	INTEGER(iwp) :: l !<
152	INTEGER(iwp) :: ys !<
153
154	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
155	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
156
157	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
158
159
160	IF ( numprocs /= 1 ) THEN
161
162	#if defined( __parallel )
163	!
164	!-- Transpose array
165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
167	!$acc update host( f_inv )
168	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
169	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
170	comm1dy, ierr )
171	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
172
173	!
174	!-- Reorder transposed array
175	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
176	!$OMP DO
177	!$acc data copyin( work )
178	DO l = 0, pdims(2) - 1
179	ys = 0 + l * ( nyn_x - nys_x + 1 )
180	!$acc kernels present( f_out, work )
181	DO i = nxl_y, nxr_y
182	DO k = nzb_y, nzt_y
183	DO j = ys, ys + nyn_x - nys_x
184	f_out(j,i,k) = work(j-ys+1,k,i,l)
185	ENDDO
186	ENDDO
187	ENDDO
188	!$acc end kernels
189	ENDDO
190	!$acc end data
191	!$OMP END PARALLEL
192	#endif
193
194	ELSE
195
196	!
197	!-- Reorder transposed array
198	!$OMP PARALLEL PRIVATE ( i, j, k )
199	!$OMP DO
200	!$acc kernels present( f_inv, f_out )
201	DO k = nzb_y, nzt_y
202	DO i = nxl_y, nxr_y
203	DO j = 0, ny
204	f_out(j,i,k) = f_inv(j,k,i)
205	ENDDO
206	ENDDO
207	ENDDO
208	!$acc end kernels
209	!$OMP END PARALLEL
210
211	ENDIF
212
213	END SUBROUTINE transpose_xy
214
215
216	!------------------------------------------------------------------------------!
217	! Description:
218	! ------------
219	!> Resorting data after the transposition from x to z. The transposition itself
220	!> is carried out in transpose_xz
221	!------------------------------------------------------------------------------!
222	SUBROUTINE resort_for_xz( f_inv, f_out )
223
224
225	USE indices, &
226	ONLY: nxl, nxr, nyn, nys, nz
227
228	USE kinds
229
230	IMPLICIT NONE
231
232	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
233	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
234
235	INTEGER(iwp) :: i !<
236	INTEGER(iwp) :: j !<
237	INTEGER(iwp) :: k !<
238	!
239	!-- Rearrange indices of input array in order to make data to be send
240	!-- by MPI contiguous.
241	!-- In case of parallel fft/transposition, scattered store is faster in
242	!-- backward direction!!!
243	!$OMP PARALLEL PRIVATE ( i, j, k )
244	!$OMP DO
245	!$acc kernels present( f_inv, f_out )
246	DO k = 1, nz
247	DO i = nxl, nxr
248	DO j = nys, nyn
249	f_out(k,j,i) = f_inv(j,i,k)
250	ENDDO
251	ENDDO
252	ENDDO
253	!$acc end kernels
254	!$OMP END PARALLEL
255
256	END SUBROUTINE resort_for_xz
257
258
259	!------------------------------------------------------------------------------!
260	! Description:
261	! ------------
262	!> Transposition of input array (f_in) from x to z. For the input array, all
263	!> elements along x reside on the same PE, while after transposition, all
264	!> elements along z reside on the same PE.
265	!------------------------------------------------------------------------------!
266	SUBROUTINE transpose_xz( f_in, f_inv )
267
268
269	USE cpulog, &
270	ONLY: cpu_log, cpu_log_nowait, log_point_s
271
272	USE indices, &
273	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
274
275	USE kinds
276
277	USE pegrid
278
279	USE transpose_indices, &
280	ONLY: nyn_x, nys_x, nzb_x, nzt_x
281
282	IMPLICIT NONE
283
284	INTEGER(iwp) :: i !<
285	INTEGER(iwp) :: j !<
286	INTEGER(iwp) :: k !<
287	INTEGER(iwp) :: l !<
288	INTEGER(iwp) :: xs !<
289
290	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
291	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
292
293	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
294
295
296	!
297	!-- If the PE grid is one-dimensional along y, the array has only to be
298	!-- reordered locally and therefore no transposition has to be done.
299	IF ( pdims(1) /= 1 ) THEN
300
301	#if defined( __parallel )
302	!
303	!-- Reorder input array for transposition
304	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
305	!$OMP DO
306	!$acc data copyout( work )
307	DO l = 0, pdims(1) - 1
308	xs = 0 + l * nnx
309	!$acc kernels present( f_in, work )
310	DO k = nzb_x, nzt_x
311	DO i = xs, xs + nnx - 1
312	DO j = nys_x, nyn_x
313	work(j,i-xs+1,k,l) = f_in(i,j,k)
314	ENDDO
315	ENDDO
316	ENDDO
317	!$acc end kernels
318	ENDDO
319	!$acc end data
320	!$OMP END PARALLEL
321
322	!
323	!-- Transpose array
324	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
325	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
326	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
327	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
328	comm1dx, ierr )
329	!$acc update device( f_inv )
330	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
331	#endif
332
333	ELSE
334
335	!
336	!-- Reorder the array in a way that the z index is in first position
337	!$OMP PARALLEL PRIVATE ( i, j, k )
338	!$OMP DO
339	!$acc kernels present( f_in, f_inv )
340	DO i = nxl, nxr
341	DO j = nys, nyn
342	DO k = 1, nz
343	f_inv(j,i,k) = f_in(i,j,k)
344	ENDDO
345	ENDDO
346	ENDDO
347	!$acc end kernels
348	!$OMP END PARALLEL
349
350	ENDIF
351
352	END SUBROUTINE transpose_xz
353
354
355	!------------------------------------------------------------------------------!
356	! Description:
357	! ------------
358	!> Resorting data after the transposition from y to x. The transposition itself
359	!> is carried out in transpose_yx
360	!------------------------------------------------------------------------------!
361	SUBROUTINE resort_for_yx( f_inv, f_out )
362
363
364	USE indices, &
365	ONLY: nx
366
367	USE kinds
368
369	USE transpose_indices, &
370	ONLY: nyn_x, nys_x, nzb_x, nzt_x
371
372	IMPLICIT NONE
373
374	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
375	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
376
377
378	INTEGER(iwp) :: i !<
379	INTEGER(iwp) :: j !<
380	INTEGER(iwp) :: k !<
381	!
382	!-- Rearrange indices of input array in order to make data to be send
383	!-- by MPI contiguous
384	!$OMP PARALLEL PRIVATE ( i, j, k )
385	!$OMP DO
386	!$acc kernels present( f_inv, f_out )
387	DO i = 0, nx
388	DO k = nzb_x, nzt_x
389	DO j = nys_x, nyn_x
390	f_out(i,j,k) = f_inv(j,k,i)
391	ENDDO
392	ENDDO
393	ENDDO
394	!$acc end kernels
395	!$OMP END PARALLEL
396
397	END SUBROUTINE resort_for_yx
398
399
400	!------------------------------------------------------------------------------!
401	! Description:
402	! ------------
403	!> Transposition of input array (f_in) from y to x. For the input array, all
404	!> elements along y reside on the same PE, while after transposition, all
405	!> elements along x reside on the same PE.
406	!------------------------------------------------------------------------------!
407	SUBROUTINE transpose_yx( f_in, f_inv )
408
409
410	USE cpulog, &
411	ONLY: cpu_log, cpu_log_nowait, log_point_s
412
413	USE indices, &
414	ONLY: nx, ny
415
416	USE kinds
417
418	USE pegrid
419
420	USE transpose_indices, &
421	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
422
423	IMPLICIT NONE
424
425	INTEGER(iwp) :: i !<
426	INTEGER(iwp) :: j !<
427	INTEGER(iwp) :: k !<
428	INTEGER(iwp) :: l !<
429	INTEGER(iwp) :: ys !<
430
431	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
432	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
433
434	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
435
436
437	IF ( numprocs /= 1 ) THEN
438
439	#if defined( __parallel )
440	!
441	!-- Reorder input array for transposition
442	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
443	!$OMP DO
444	!$acc data copyout( work )
445	DO l = 0, pdims(2) - 1
446	ys = 0 + l * ( nyn_x - nys_x + 1 )
447	!$acc kernels present( f_in, work )
448	DO i = nxl_y, nxr_y
449	DO k = nzb_y, nzt_y
450	DO j = ys, ys + nyn_x - nys_x
451	work(j-ys+1,k,i,l) = f_in(j,i,k)
452	ENDDO
453	ENDDO
454	ENDDO
455	!$acc end kernels
456	ENDDO
457	!$acc end data
458	!$OMP END PARALLEL
459
460	!
461	!-- Transpose array
462	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
463	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
464	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
465	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
466	comm1dy, ierr )
467	!$acc update device( f_inv )
468	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
469	#endif
470
471	ELSE
472
473	!
474	!-- Reorder array f_in the same way as ALLTOALL did it
475	!$OMP PARALLEL PRIVATE ( i, j, k )
476	!$OMP DO
477	!$acc kernels present( f_in, f_inv )
478	DO i = nxl_y, nxr_y
479	DO k = nzb_y, nzt_y
480	DO j = 0, ny
481	f_inv(j,k,i) = f_in(j,i,k)
482	ENDDO
483	ENDDO
484	ENDDO
485	!$acc end kernels
486	!$OMP END PARALLEL
487
488	ENDIF
489
490	END SUBROUTINE transpose_yx
491
492
493	!------------------------------------------------------------------------------!
494	! Description:
495	! ------------
496	!> Transposition of input array (f_in) from y to x. For the input array, all
497	!> elements along y reside on the same PE, while after transposition, all
498	!> elements along x reside on the same PE.
499	!> This is a direct transposition for arrays with indices in regular order
500	!> (k,j,i) (cf. transpose_yx).
501	!------------------------------------------------------------------------------!
502	SUBROUTINE transpose_yxd( f_in, f_out )
503
504
505	USE cpulog, &
506	ONLY: cpu_log, cpu_log_nowait, log_point_s
507
508	USE indices, &
509	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
510
511	USE kinds
512
513	USE pegrid
514
515	USE transpose_indices, &
516	ONLY: nyn_x, nys_x, nzb_x, nzt_x
517
518	IMPLICIT NONE
519
520	INTEGER(iwp) :: i !<
521	INTEGER(iwp) :: j !<
522	INTEGER(iwp) :: k !<
523	INTEGER(iwp) :: l !<
524	INTEGER(iwp) :: m !<
525	INTEGER(iwp) :: xs !<
526
527	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
528	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
529	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
530	REAL(wp) :: work(nnxnnynnz) !<
531	#if defined( __parallel )
532
533	!
534	!-- Rearrange indices of input array in order to make data to be send
535	!-- by MPI contiguous
536	DO k = 1, nz
537	DO j = nys, nyn
538	DO i = nxl, nxr
539	f_inv(i,k,j) = f_in(k,j,i)
540	ENDDO
541	ENDDO
542	ENDDO
543
544	!
545	!-- Transpose array
546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
547	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
548	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
549	work(1), sendrecvcount_xy, MPI_REAL, &
550	comm1dx, ierr )
551	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
552
553	!
554	!-- Reorder transposed array
555	m = 0
556	DO l = 0, pdims(1) - 1
557	xs = 0 + l * nnx
558	DO j = nys_x, nyn_x
559	DO k = 1, nz
560	DO i = xs, xs + nnx - 1
561	m = m + 1
562	f_out(i,j,k) = work(m)
563	ENDDO
564	ENDDO
565	ENDDO
566	ENDDO
567
568	#endif
569
570	END SUBROUTINE transpose_yxd
571
572
573	!------------------------------------------------------------------------------!
574	! Description:
575	! ------------
576	!> Resorting data for the transposition from y to z. The transposition itself
577	!> is carried out in transpose_yz
578	!------------------------------------------------------------------------------!
579	SUBROUTINE resort_for_yz( f_in, f_inv )
580
581
582	USE indices, &
583	ONLY: ny
584
585	USE kinds
586
587	USE transpose_indices, &
588	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
589
590	IMPLICIT NONE
591
592	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
593	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
594
595	INTEGER(iwp) :: i !<
596	INTEGER(iwp) :: j !<
597	INTEGER(iwp) :: k !<
598
599	!
600	!-- Rearrange indices of input array in order to make data to be send
601	!-- by MPI contiguous
602	!$OMP PARALLEL PRIVATE ( i, j, k )
603	!$OMP DO
604	!$acc kernels present( f_in, f_inv )
605	DO j = 0, ny
606	DO k = nzb_y, nzt_y
607	DO i = nxl_y, nxr_y
608	f_inv(i,k,j) = f_in(j,i,k)
609	ENDDO
610	ENDDO
611	ENDDO
612	!$acc end kernels
613	!$OMP END PARALLEL
614
615	END SUBROUTINE resort_for_yz
616
617
618	!------------------------------------------------------------------------------!
619	! Description:
620	! ------------
621	!> Transposition of input array (f_in) from y to z. For the input array, all
622	!> elements along y reside on the same PE, while after transposition, all
623	!> elements along z reside on the same PE.
624	!------------------------------------------------------------------------------!
625	SUBROUTINE transpose_yz( f_inv, f_out )
626
627
628	USE cpulog, &
629	ONLY: cpu_log, cpu_log_nowait, log_point_s
630
631	USE indices, &
632	ONLY: ny, nz
633
634	USE kinds
635
636	USE pegrid
637
638	USE transpose_indices, &
639	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
640
641	IMPLICIT NONE
642
643	INTEGER(iwp) :: i !<
644	INTEGER(iwp) :: j !<
645	INTEGER(iwp) :: k !<
646	INTEGER(iwp) :: l !<
647	INTEGER(iwp) :: zs !<
648
649	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
650	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
651
652	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
653
654
655	!
656	!-- If the PE grid is one-dimensional along y, only local reordering
657	!-- of the data is necessary and no transposition has to be done.
658	IF ( pdims(1) == 1 ) THEN
659
660	!$OMP PARALLEL PRIVATE ( i, j, k )
661	!$OMP DO
662	!$acc kernels present( f_inv, f_out )
663	DO j = 0, ny
664	DO k = nzb_y, nzt_y
665	DO i = nxl_y, nxr_y
666	f_out(i,j,k) = f_inv(i,k,j)
667	ENDDO
668	ENDDO
669	ENDDO
670	!$acc end kernels
671	!$OMP END PARALLEL
672
673	ELSE
674
675	#if defined( __parallel )
676	!
677	!-- Transpose array
678	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
679	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
680	!$acc update host( f_inv )
681	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
682	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
683	comm1dx, ierr )
684	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
685
686	!
687	!-- Reorder transposed array
688	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
689	!$OMP DO
690	!$acc data copyin( work )
691	DO l = 0, pdims(1) - 1
692	zs = 1 + l * ( nzt_y - nzb_y + 1 )
693	!$acc kernels present( f_out )
694	DO j = nys_z, nyn_z
695	DO k = zs, zs + nzt_y - nzb_y
696	DO i = nxl_z, nxr_z
697	f_out(i,j,k) = work(i,k-zs+1,j,l)
698	ENDDO
699	ENDDO
700	ENDDO
701	!$acc end kernels
702	ENDDO
703	!$acc end data
704	!$OMP END PARALLEL
705	#endif
706
707	ENDIF
708
709	END SUBROUTINE transpose_yz
710
711
712	!------------------------------------------------------------------------------!
713	! Description:
714	! ------------
715	!> Resorting data for the transposition from z to x. The transposition itself
716	!> is carried out in transpose_zx
717	!------------------------------------------------------------------------------!
718	SUBROUTINE resort_for_zx( f_in, f_inv )
719
720
721	USE indices, &
722	ONLY: nxl, nxr, nyn, nys, nz
723
724	USE kinds
725
726	IMPLICIT NONE
727
728	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
729	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
730
731	INTEGER(iwp) :: i !<
732	INTEGER(iwp) :: j !<
733	INTEGER(iwp) :: k !<
734
735	!
736	!-- Rearrange indices of input array in order to make data to be send
737	!-- by MPI contiguous
738	!$OMP PARALLEL PRIVATE ( i, j, k )
739	!$OMP DO
740	!$acc kernels present( f_in, f_inv )
741	DO k = 1,nz
742	DO i = nxl, nxr
743	DO j = nys, nyn
744	f_inv(j,i,k) = f_in(k,j,i)
745	ENDDO
746	ENDDO
747	ENDDO
748	!$acc end kernels
749	!$OMP END PARALLEL
750
751	END SUBROUTINE resort_for_zx
752
753
754	!------------------------------------------------------------------------------!
755	! Description:
756	! ------------
757	!> Transposition of input array (f_in) from z to x. For the input array, all
758	!> elements along z reside on the same PE, while after transposition, all
759	!> elements along x reside on the same PE.
760	!------------------------------------------------------------------------------!
761	SUBROUTINE transpose_zx( f_inv, f_out )
762
763
764	USE cpulog, &
765	ONLY: cpu_log, cpu_log_nowait, log_point_s
766
767	USE indices, &
768	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
769
770	USE kinds
771
772	USE pegrid
773
774	USE transpose_indices, &
775	ONLY: nyn_x, nys_x, nzb_x, nzt_x
776
777	IMPLICIT NONE
778
779	INTEGER(iwp) :: i !<
780	INTEGER(iwp) :: j !<
781	INTEGER(iwp) :: k !<
782	INTEGER(iwp) :: l !<
783	INTEGER(iwp) :: xs !<
784
785	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
786	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
787
788	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
789
790
791	!
792	!-- If the PE grid is one-dimensional along y, only local reordering
793	!-- of the data is necessary and no transposition has to be done.
794	IF ( pdims(1) == 1 ) THEN
795
796	!$OMP PARALLEL PRIVATE ( i, j, k )
797	!$OMP DO
798	!$acc kernels present( f_inv, f_out )
799	DO k = 1, nz
800	DO i = nxl, nxr
801	DO j = nys, nyn
802	f_out(i,j,k) = f_inv(j,i,k)
803	ENDDO
804	ENDDO
805	ENDDO
806	!$acc end kernels
807	!$OMP END PARALLEL
808
809	ELSE
810
811	#if defined( __parallel )
812	!
813	!-- Transpose array
814	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
815	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
816	!$acc update host( f_inv )
817	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
818	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
819	comm1dx, ierr )
820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
821
822	!
823	!-- Reorder transposed array
824	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
825	!$OMP DO
826	!$acc data copyin( work )
827	DO l = 0, pdims(1) - 1
828	xs = 0 + l * nnx
829	!$acc kernels present( f_out )
830	DO k = nzb_x, nzt_x
831	DO i = xs, xs + nnx - 1
832	DO j = nys_x, nyn_x
833	f_out(i,j,k) = work(j,i-xs+1,k,l)
834	ENDDO
835	ENDDO
836	ENDDO
837	!$acc end kernels
838	ENDDO
839	!$acc end data
840	!$OMP END PARALLEL
841	#endif
842
843	ENDIF
844
845	END SUBROUTINE transpose_zx
846
847
848	!------------------------------------------------------------------------------!
849	! Description:
850	! ------------
851	!> Resorting data after the transposition from z to y. The transposition itself
852	!> is carried out in transpose_zy
853	!------------------------------------------------------------------------------!
854	SUBROUTINE resort_for_zy( f_inv, f_out )
855
856
857	USE indices, &
858	ONLY: ny
859
860	USE kinds
861
862	USE transpose_indices, &
863	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
864
865	IMPLICIT NONE
866
867	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
868	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
869
870
871	INTEGER(iwp) :: i !<
872	INTEGER(iwp) :: j !<
873	INTEGER(iwp) :: k !<
874
875	!
876	!-- Rearrange indices of input array in order to make data to be send
877	!-- by MPI contiguous
878	!$OMP PARALLEL PRIVATE ( i, j, k )
879	!$OMP DO
880	!$acc kernels present( f_inv, f_out )
881	DO k = nzb_y, nzt_y
882	DO j = 0, ny
883	DO i = nxl_y, nxr_y
884	f_out(j,i,k) = f_inv(i,k,j)
885	ENDDO
886	ENDDO
887	ENDDO
888	!$acc end kernels
889	!$OMP END PARALLEL
890
891	END SUBROUTINE resort_for_zy
892
893
894	!------------------------------------------------------------------------------!
895	! Description:
896	! ------------
897	!> Transposition of input array (f_in) from z to y. For the input array, all
898	!> elements along z reside on the same PE, while after transposition, all
899	!> elements along y reside on the same PE.
900	!------------------------------------------------------------------------------!
901	SUBROUTINE transpose_zy( f_in, f_inv )
902
903
904	USE cpulog, &
905	ONLY: cpu_log, cpu_log_nowait, log_point_s
906
907	USE indices, &
908	ONLY: ny, nz
909
910	USE kinds
911
912	USE pegrid
913
914	USE transpose_indices, &
915	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
916
917	IMPLICIT NONE
918
919	INTEGER(iwp) :: i !<
920	INTEGER(iwp) :: j !<
921	INTEGER(iwp) :: k !<
922	INTEGER(iwp) :: l !<
923	INTEGER(iwp) :: zs !<
924
925	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
926	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
927
928	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
929
930	!
931	!-- If the PE grid is one-dimensional along y, the array has only to be
932	!-- reordered locally and therefore no transposition has to be done.
933	IF ( pdims(1) /= 1 ) THEN
934
935	#if defined( __parallel )
936	!
937	!-- Reorder input array for transposition
938	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
939	!$OMP DO
940	!$acc data copyout( work )
941	DO l = 0, pdims(1) - 1
942	zs = 1 + l * ( nzt_y - nzb_y + 1 )
943	!$acc kernels present( f_in, work )
944	DO j = nys_z, nyn_z
945	DO k = zs, zs + nzt_y - nzb_y
946	DO i = nxl_z, nxr_z
947	work(i,k-zs+1,j,l) = f_in(i,j,k)
948	ENDDO
949	ENDDO
950	ENDDO
951	!$acc end kernels
952	ENDDO
953	!$acc end data
954	!$OMP END PARALLEL
955
956	!
957	!-- Transpose array
958	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
959	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
960	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
961	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
962	comm1dx, ierr )
963	!$acc update device( f_inv )
964	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
965	#endif
966
967	ELSE
968	!
969	!-- Reorder the array in the same way like ALLTOALL did it
970	!$OMP PARALLEL PRIVATE ( i, j, k )
971	!$OMP DO
972	!$acc kernels present( f_in, f_inv )
973	DO k = nzb_y, nzt_y
974	DO j = 0, ny
975	DO i = nxl_y, nxr_y
976	f_inv(i,k,j) = f_in(i,j,k)
977	ENDDO
978	ENDDO
979	ENDDO
980	!$acc end kernels
981	!$OMP END PARALLEL
982
983	ENDIF
984
985	END SUBROUTINE transpose_zy
986
987
988	!------------------------------------------------------------------------------!
989	! Description:
990	! ------------
991	!> Transposition of input array (f_in) from z to y. For the input array, all
992	!> elements along z reside on the same PE, while after transposition, all
993	!> elements along y reside on the same PE.
994	!> This is a direct transposition for arrays with indices in regular order
995	!> (k,j,i) (cf. transpose_zy).
996	!------------------------------------------------------------------------------!
997	SUBROUTINE transpose_zyd( f_in, f_out )
998
999
1000	USE cpulog, &
1001	ONLY: cpu_log, cpu_log_nowait, log_point_s
1002
1003	USE indices, &
1004	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1005
1006	USE kinds
1007
1008	USE pegrid
1009
1010	USE transpose_indices, &
1011	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
1012
1013	IMPLICIT NONE
1014
1015	INTEGER(iwp) :: i !<
1016	INTEGER(iwp) :: j !<
1017	INTEGER(iwp) :: k !<
1018	INTEGER(iwp) :: l !<
1019	INTEGER(iwp) :: m !<
1020	INTEGER(iwp) :: ys !<
1021
1022	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1023	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1024	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1025	REAL(wp) :: work(nnxnnynnz) !<
1026
1027	#if defined( __parallel )
1028
1029	!
1030	!-- Rearrange indices of input array in order to make data to be send
1031	!-- by MPI contiguous
1032	DO i = nxl, nxr
1033	DO j = nys, nyn
1034	DO k = 1, nz
1035	f_inv(j,i,k) = f_in(k,j,i)
1036	ENDDO
1037	ENDDO
1038	ENDDO
1039
1040	!
1041	!-- Move data to different array, because memory location of work1 is
1042	!-- needed further below (work1 = work2).
1043	!-- If the PE grid is one-dimensional along x, only local reordering
1044	!-- of the data is necessary and no transposition has to be done.
1045	IF ( pdims(2) == 1 ) THEN
1046	DO k = 1, nz
1047	DO i = nxl, nxr
1048	DO j = nys, nyn
1049	f_out(j,i,k) = f_inv(j,i,k)
1050	ENDDO
1051	ENDDO
1052	ENDDO
1053	RETURN
1054	ENDIF
1055
1056	!
1057	!-- Transpose array
1058	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1059	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1060	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1061	work(1), sendrecvcount_zyd, MPI_REAL, &
1062	comm1dy, ierr )
1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1064
1065	!
1066	!-- Reorder transposed array
1067	m = 0
1068	DO l = 0, pdims(2) - 1
1069	ys = 0 + l * nny
1070	DO k = nzb_yd, nzt_yd
1071	DO i = nxl_yd, nxr_yd
1072	DO j = ys, ys + nny - 1
1073	m = m + 1
1074	f_out(j,i,k) = work(m)
1075	ENDDO
1076	ENDDO
1077	ENDDO
1078	ENDDO
1079
1080	#endif
1081
1082	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |