Upload
galilea-sheward
View
240
Download
1
Embed Size (px)
Citation preview
Asynchronous I/O with MPI
Anthony Danalis
Basic Non-Blocking API
MPI_Isend() MPI_Irecv() MPI_Wait()
MPI_Waitall() MPI_Waitany()
MPI_Test() MPI_Testall() MPI_Testany()
API: sending
MPI_ISEND(buf, count, datatype, dest, tag, comm, request)
[ IN buf] initial address of send buffer (choice)
[ IN count] number of elements in send buffer (integer)
[ IN datatype] datatype of each send buffer element (handle)
[ IN dest] rank of destination (integer)
[ IN tag] message tag (integer)
[ IN comm] communicator (handle)
[ OUT request] communication request (handle)
API: receiving
MPI_IRECV (buf, count, datatype, source, tag, comm, request)
[ OUT buf] initial address of receive buffer (choice)
[ IN count] number of elements in receive buffer (integer)
[ IN datatype] datatype of each receive buffer element (handle)
[ IN source] rank of source (integer)
[ IN tag] message tag (integer)
[ IN comm] communicator (handle)
[ OUT request] communication request (handle)
API: blocking/polling
MPI_WAIT(request, status)
[ INOUT request] request (handle)
[ OUT status] status object (Status)
MPI_TEST(request, flag, status)
[ INOUT request] communication request (handle)
[ OUT flag] true if operation completed (logical)
[ OUT status] status object (Status)
Why N-B I/O? Example 1
! Example 1: rank 0 posts a non-blocking send of 10 MPI_REAL elements starting
! at a(1) to rank 1; every other rank posts a non-blocking receive from rank 0
! into a 15-element buffer starting at a(1).  The request is waited on right
! away, so nothing is done between posting and completion.
CALL MPI_COMM_RANK(comm, rank, ierr)
IF (rank == 0) THEN
  CALL MPI_ISEND(a(1), 10, MPI_REAL, 1, tag, comm, request, ierr)
ELSE
  CALL MPI_IRECV(a(1), 15, MPI_REAL, 0, tag, comm, request, ierr)
END IF
! Sender and receiver complete their single outstanding request the same way.
CALL MPI_WAIT(request, status, ierr)
Why N-B I/O? Example 2
! Example 2: the same exchange as a post / compute / wait sequence, so the
! work placed between the post and MPI_WAIT can proceed while the transfer
! is outstanding.
! NOTE(review): only rank 1 is sent a message, yet every non-zero rank posts a
! receive — presumably meant for exactly two ranks; confirm.
CALL MPI_COMM_RANK(comm, rank, ierr)
IF(rank .EQ. 0) THEN
! Post the send of 10 MPI_REAL elements starting at a(1) to rank 1.
CALL MPI_ISEND(a(1), 10, MPI_REAL, 1, tag, comm, request, ierr)
**** do some computation to mask latency ****
! Complete the send request before going on.
CALL MPI_WAIT(request, status, ierr)
ELSE
! Post a receive from rank 0 into a 15-element MPI_REAL buffer at a(1).
CALL MPI_IRECV(a(1), 15, MPI_REAL, 0, tag, comm, request, ierr)
**** do some computation to mask latency ****
! Complete the receive request before going on.
CALL MPI_WAIT(request, status, ierr)
END IF
Why N-B I/O? Deadlocks(?)
! Schematic only: argument lists are omitted.
! Both branches call MPI_SEND first and MPI_RECV second.
! NOTE(review): if MPI_SEND does not return until the matching receive has been
! posted, neither rank reaches its MPI_RECV — presumably the deadlock this
! slide's title asks about; confirm.
IF(rank .EQ. 0) THEN
CALL MPI_SEND()
CALL MPI_RECV()
ELSE
CALL MPI_SEND()
CALL MPI_RECV()
END IF
Blocking
! Schematic only: argument lists are omitted.
! Each rank posts its send and its receive with the non-blocking calls and only
! afterwards waits for all of them with MPI_WAITALL.
IF(rank .EQ. 0) THEN
CALL MPI_ISEND()
CALL MPI_IRECV()
ELSE
CALL MPI_ISEND()
CALL MPI_IRECV()
END IF
! Completion of every request posted above.
CALL MPI_WAITALL()
Non-Blocking
Why N-B I/O? Efficiency (painless)
// Schematic only: most arguments are elided ("...").
// For every offset i = 1 .. NP-1 a receive and a send are posted, and all of
// them are completed by one MPI_Waitall after the loop.
for(i=1; i<NP; i++){
// Rank i positions behind this one (wrapping around NP).
sender = (NP+myRank-i)%NP;
// Rank i positions ahead of this one (wrapping around NP).
receiver = (myRank+i)%NP;
MPI_Irecv(sender, ...)
MPI_Isend(receiver, ...)
}
MPI_Waitall()
1
2
0 1
1
2
1
12
2
Why N-B I/O? Overlapping
Communication / Computation
Communication / Communication
Asynchronous I/O
Overlap
Non-Blocking VS Asynchronous
Non-Blocking => function call returns
Asynchronous => CPU does other things
! Post / compute / wait pattern: the request is posted, other work is done, and
! only then is the request completed with MPI_WAIT.
IF(rank .EQ. 0) THEN
! Post the send of 10 MPI_REAL elements starting at a(1) to rank 1.
CALL MPI_ISEND(a(1), 10, MPI_REAL, 1, tag, comm, request, ierr)
**** do some computation to mask latency ****
! Complete the send request.
CALL MPI_WAIT(request, status, ierr)
ELSE
! Post a receive from rank 0 into a 15-element MPI_REAL buffer at a(1).
CALL MPI_IRECV(a(1), 15, MPI_REAL, 0, tag, comm, request, ierr)
**** do some computation to mask latency ****
! Complete the receive request.
CALL MPI_WAIT(request, status, ierr)
END IF
Multidimensional FFT
P1
P2
P2 P1
I/O
I/O
1-D FFT 1-D FFT
1-D FFT 1-D FFT
1-D FFT
1-D FFT
1-D FFT
1-D FFT
1-D FFT
1-D FFT
Dimension X
Dimension Y
FFT code: MPI_ALLTOALL()
! Transpose step with one collective call.
! For every local (IY,IZ) line the real data in AP is converted into A3, and
! each element of A3 is copied into the slab of CAT2 selected by IPROC+1.
! All slabs are then exchanged with a single MPI_ALLTOALL into CAT.
! NOTE(review): the index arithmetic assumes (NX/2) is a multiple of NPROC and
! that one slab of CAT2 holds exactly NT elements — confirm against the
! declarations.
DO IZ = 1, (NZ/NPROC)
  DO IY = 1, NYP
    CALL REAL_TO_COMPLEX( REAL_IN=AP(:,IY,IZ), COMPLEX_OUT=A3 )
    DO IX = 1, (NX/2)
      ! IPROC is the 0-based slab number for element IX; IXEFF is the
      ! 1-based position of that element inside the slab.
      IPROC = (IX-1)/((NX/2)/NPROC)
      IXEFF = IX - IPROC*((NX/2)/NPROC)
      CAT2(IXEFF,IY,IZ,IPROC+1) = A3(IX)
    ENDDO
  ENDDO
ENDDO

! Element count passed to MPI_ALLTOALL for both the send and the receive side.
NT = NYP*(NZ/NPROC)*((NX/2)/NPROC)
CALL MPI_BARRIER( MPI_COMM_WORLD, ERR )
CALL MPI_ALLTOALL( CAT2(1,1,1,1), NT, MPI_DOUBLE_COMPLEX, &
                   CAT(1,1,1,1), NT, MPI_DOUBLE_COMPLEX, MPI_COMM_WORLD, ERR )
FFT code: MPI_SEND() + MPI_RECV()
! Transpose step with blocking point-to-point calls.
! For every local (IY,IZ) line: convert AP into A3, then for each offset
! IPROC = 1 .. NPROC-1 send one chunk of A3 to the rank IPROC ahead and receive
! one chunk from the rank IPROC behind, storing it into AUX.  The chunk that
! belongs to this rank (MYNUM) is copied from A3 into AUX directly.
! NOTE(review): every rank calls MPI_SEND before MPI_RECV; if MPI_SEND waits
! for the matching receive, all ranks stall — confirm that the messages are
! small enough to be buffered or pair the calls differently.
DO IZ = 1, (NZ/NPROC)
  DO IY = 1, NYP
    CALL REAL_TO_COMPLEX( REAL_IN=AP(:,IY,IZ), COMPLEX_OUT=A3 )

    DO IPROC = 1, NPROC-1
      ! Destination rank and the first element of A3 that goes to it.
      REAL_RECEIVER = MOD(IPROC+MYNUM, NPROC)
      CHUNCK = ((NX/2)/NPROC)*REAL_RECEIVER+1
      CALL MPI_SEND(A3(CHUNCK), ((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, &
                    REAL_RECEIVER, 0, MPI_COMM_WORLD, ERR)

      ! Source rank for this offset; its chunk lands in DAN_A3.
      REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
      CALL MPI_RECV(DAN_A3(1), ((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, &
                    REAL_SUBMITER, 0, MPI_COMM_WORLD, STATUS, ERR )
      DO IX = 1, ((NX/2)/NPROC)
        TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
        AUX(TARG_IZ,IY,IX) = DAN_A3(IX);
      ENDDO
    ENDDO

    ! Local chunk: no message needed.
    DO IX = 1, ((NX/2)/NPROC)
      TARG_IZ = IZ + MYNUM*(NZ/NPROC)
      AUX(TARG_IZ,IY,IX) = A3( IX+((NX/2)/NPROC)*MYNUM );
    ENDDO

  ENDDO
ENDDO
! Transpose step with non-blocking point-to-point calls.
! In iteration IY the requests posted in iteration IY-1 are completed and their
! data (one column of DAN_A3 per source rank) is stored into row IY-1 of AUX;
! then the receives and sends for row IY are posted.  The requests of the last
! row (NYP) are completed and unpacked after the IY loop.
! REQST holds two requests per offset IPROC: 2*IPROC-1 for the receive and
! 2*IPROC for the send.
! NOTE(review): REAL_TO_COMPLEX rewrites A3 before the MPI_WAITALL that
! completes the MPI_ISENDs posted from A3 in the previous iteration — this is
! presumably the hazard the slide's "race?" remark refers to; statement order
! is kept as on the slide.
! NOTE(review): on the very first pass MPI_WAITALL runs before any request has
! been posted in this fragment — presumably REQST is initialised elsewhere;
! verify.
DO IZ = 1, (NZ/NPROC)
  DO IY = 1, NYP
    CALL REAL_TO_COMPLEX( REAL_IN=AP(:,IY,IZ), COMPLEX_OUT=A3 )
    CALL MPI_WAITALL( (2*(NPROC-1)), REQST(1), STATS, ERR)

    ! Unpack what was received for the previous row.
    IF( IY > 1 ) THEN
      DO IPROC = 1, NPROC-1
        REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
        DO IX = 1, ((NX/2)/NPROC)
          TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
          AUX(TARG_IZ,IY-1,IX) = DAN_A3(IX, IPROC);
        ENDDO
      ENDDO
    ENDIF

    ! Post this row's receives and sends.
    DO IPROC = 1, NPROC-1
      REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC,NPROC)
      CALL MPI_IRECV( DAN_A3(1,IPROC), ((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, REAL_SUBMITER, 0, &
                      MPI_COMM_WORLD, REQST((2*IPROC-1)), ERR)
      REAL_RECEIVER = MOD(IPROC+MYNUM, NPROC)
      CHUNCK = ((NX/2)/NPROC)*REAL_RECEIVER+1
      CALL MPI_ISEND(A3(CHUNCK), ((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, REAL_RECEIVER, 0, &
                     MPI_COMM_WORLD, REQST((2*IPROC)), ERR)
    ENDDO

    ! Local chunk: copied straight from A3.
    DO IX = 1, ((NX/2)/NPROC)
      TARG_IZ = IZ + MYNUM*(NZ/NPROC)
      AUX(TARG_IZ,IY,IX) = A3( IX+((NX/2)/NPROC)*MYNUM );
    ENDDO
  ENDDO

  ! Complete and unpack the requests posted for the last row.
  CALL MPI_WAITALL( (2*(NPROC-1)), REQST(1), STATS, ERR)
  DO IPROC = 1, NPROC-1
    REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
    TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
    DO IX = 1, ((NX/2)/NPROC)
      AUX(TARG_IZ,NYP,IX) = DAN_A3(IX, IPROC);
    ENDDO
  ENDDO
ENDDO
FFT code: MPI_iSEND() + MPI_iRECV()
race?
! K-batched non-blocking transpose, part 1 (the DO IZ loop opened here is
! closed on a later slide of this listing).
! Rows are converted into A3(:, row-in-batch, buffer) where row-in-batch is
! 1+MOD(IY-1,K_LOOPS) and buffer alternates between 1 and 2 from one batch of
! K_LOOPS rows to the next.  Whenever IY is a multiple of K_LOOPS the previous
! batch's requests are completed and unpacked from DAN_RECV into AUX, and the
! batch that has just been filled is posted.
! NOTE(review): MPI_ISEND is handed the array section
! A3(CHNK_STR:CHNK_END, :, ...); a compiler may pass a temporary copy that no
! longer exists once the non-blocking call returns — confirm how the MPI
! bindings in use treat array sections, or pack into a contiguous buffer.
DO IZ = 1, (NZ/NPROC) !{
  DO IY = 1, NYP !{
    CALL REAL_TO_COMPLEX( REAL_IN=AP(:,IY,IZ), &
         COMPLEX_OUT=A3(1,1+MOD(IY-1,K_LOOPS), 1+MOD((IY-1)/K_LOOPS,2)))

    ! Complete and unpack the batch posted K_LOOPS rows ago.
    IF( (IY > K_LOOPS) .AND. (MOD(IY,K_LOOPS) == 0) ) THEN !{
      CALL MPI_WAITALL( (2*(NPROC-1)), REQST(1), STATS, ERR)
      DO IPROC = 1, NPROC-1 !{
        REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
        TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
        DO IIK = 1, K_LOOPS !{
          ! Row of AUX that row IIK of the previous batch belongs to.
          OLDIY = IY-2*K_LOOPS+IIK
          DO IX = 1, ((NX/2)/NPROC) !{
            AUX(TARG_IZ,OLDIY,IX) = DAN_RECV(IX, IIK, IPROC);
          ENDDO !}
        ENDDO !}
      ENDDO !}
    ENDIF !}

    ! A batch is full: post its receives and sends.
    IF( MOD(IY,K_LOOPS) == 0 ) THEN !{
      DO IPROC = 1, NPROC-1 !{
        REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC,NPROC)
        CALL MPI_IRECV( DAN_RECV(1,1,IPROC), K_LOOPS*((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, &
             REAL_SUBMITER, 0, MPI_COMM_WORLD, REQST((2*IPROC-1)), ERR)
        REAL_RECEIVER = MOD(IPROC+MYNUM, NPROC)
        CHNK_STR = ((NX/2)/NPROC)*REAL_RECEIVER+1
        CHNK_END = CHNK_STR+((NX/2)/NPROC)-1
        CALL MPI_ISEND(A3( CHNK_STR:CHNK_END, :, 1+MOD((IY-1)/K_LOOPS,2) ), K_LOOPS*((NX/2)/NPROC), &
             MPI_DOUBLE_COMPLEX, REAL_RECEIVER, 0, MPI_COMM_WORLD, REQST((2*IPROC)), ERR)
      ENDDO !}
    ENDIF !}

    ! Local chunk: copied straight from the current row of A3.
    DO IX = 1, ((NX/2)/NPROC) !{
      TARG_IZ = IZ + MYNUM*(NZ/NPROC)
      AUX(TARG_IZ,IY,IX) = A3( IX+((NX/2)/NPROC)*MYNUM, 1+MOD(IY-1,K_LOOPS), 1+MOD((IY-1)/K_LOOPS,2));
    ENDDO !}
  ENDDO !}
FFT code: MPI_iSEND() + MPI_iRECV() (K)
FFT code: MPI_iSEND() + MPI_iRECV() (K) cont.
! Post the exchange for the trailing rows that do not fill a whole batch of
! K_LOOPS rows.  These requests go into LASTREQST (2*IPROC-1 = receive,
! 2*IPROC = send), separate from REQST.
LEFTOVERS = MOD(NYP,K_LOOPS)
IF( LEFTOVERS /= 0 ) THEN !{
  DO IPROC = 1, NPROC-1 !{
    REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
    ! The receive count matches the send below and the later unpack of
    ! LEFTOVERS_RECV(IX, IIK, IPROC): LEFTOVERS rows of (NX/2)/NPROC elements.
    ! (The slide had LEFTOVERS*(NZ/NPROC) here.)
    CALL MPI_IRECV(LEFTOVERS_RECV(1,1,IPROC), LEFTOVERS*((NX/2)/NPROC), MPI_DOUBLE_COMPLEX, &
         REAL_SUBMITER, 0, MPI_COMM_WORLD, LASTREQST((2*IPROC-1)), ERR)
    REAL_RECEIVER = MOD(IPROC+MYNUM, NPROC)
    CHNK_STR = ((NX/2)/NPROC)*REAL_RECEIVER+1
    CHNK_END = CHNK_STR+((NX/2)/NPROC)-1
    ! NOTE(review): an array section is passed to a non-blocking call; a
    ! compiler-made temporary copy may not outlive the call — confirm.
    CALL MPI_ISEND(A3( CHNK_STR:CHNK_END, 1:LEFTOVERS, 1+MOD((NYP-1)/K_LOOPS,2) ), LEFTOVERS*((NX/2)/NPROC),&
         MPI_DOUBLE_COMPLEX, REAL_RECEIVER, 0, MPI_COMM_WORLD, LASTREQST((2*IPROC)), ERR)
  ENDDO !}
ENDIF !}
!!!!! deal with the chunks where IY=K_LOOPS*(NYP/K_LOOPS)-K_LOOPS+1:K_LOOPS*(NYP/K_LOOPS)
! Complete the requests of the last full batch and unpack it into AUX.
! NOTE(review): if NYP < K_LOOPS no batch was posted on REQST before this
! wait — presumably NYP >= K_LOOPS is assumed; verify.
CALL MPI_WAITALL( (2*(NPROC-1)), REQST(1), STATS, ERR)

DO IPROC = 1, NPROC-1 !{
  REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
  TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
  DO IIK = 1, K_LOOPS !{
    ! Row of AUX for row IIK of the last full batch.
    OLDIY = NYP-LEFTOVERS-K_LOOPS+IIK
    DO IX = 1, ((NX/2)/NPROC) !{
      AUX(TARG_IZ,OLDIY,IX) = DAN_RECV(IX, IIK, IPROC);
    ENDDO !}
  ENDDO !}
ENDDO !}
FFT code: MPI_iSEND() + MPI_iRECV() (K) cont.
! Complete the leftover-row requests (LASTREQST) and unpack LEFTOVERS_RECV
! into the last LEFTOVERS rows of AUX.
IF( LEFTOVERS /= 0 ) THEN !{
  !!!!! deal with the chunks where IY=NYP-MOD(NYP,K_LOOPS)+1:NYP
  CALL MPI_WAITALL( (2*(NPROC-1)), LASTREQST(1), STATS, ERR)
  DO IPROC = 1, NPROC-1 !{
    REAL_SUBMITER = MOD(NPROC+MYNUM-IPROC, NPROC)
    TARG_IZ = IZ + REAL_SUBMITER*(NZ/NPROC)
    DO IIK = 1, LEFTOVERS !{
      OLDIY = NYP-LEFTOVERS+IIK
      DO IX = 1, ((NX/2)/NPROC) !{
        AUX(TARG_IZ,OLDIY,IX) = LEFTOVERS_RECV(IX, IIK, IPROC);
      ENDDO !}
    ENDDO !}
  ENDDO !}
ENDIF !}
! Closes the DO IZ loop opened at the start of the (K) listing.
ENDDO !}
Pseudocode: MPI_iSEND() + MPI_iRECV() (K&D)
! Pseudocode: results are sent in batches every K iterations of j, and a wait
! on an earlier request is issued only once j has passed D*K.  The requests
! still outstanding are waited on after the j loop.
do i = 1, NX
  do j = 1, NY
    kernel(indata[j], outdata[j])

    if( (j > D*K) && (j%K == 0) ) then
      MPI_WAITALL(request[j-D])
    endif

    if( j%K == 0 ) then
      MPI_iSEND(outdata[j-K], SIZE=K)
      MPI_iRECV(recv_data, request[j])
    endif
  enddo

  do m = 0, D-1
    MPI_WAITALL(request[NY-D+m])
  enddo
  other_computation()
enddo
Performance gain
Comput/Commun Overlapping
time
progress
communication
computation