59
OpenMP Tutorial — Parallel Programming Lecture Slides

OpenMP · 2011-07-05 · OpenMP ... pc ?

  • Upload
    others

  • View
    19

  • Download
    0

Embed Size (px)

Citation preview

Page 1: OpenMP · 2011-07-05 · OpenMP ... pc ?

1

OpenMP

Page 2: OpenMP · 2011-07-05 · OpenMP ... pc ?

2

•!•!

–!OpenMP •! Open•! Advanced Topics

–!SMP Hybrid Programming –!OpenMP 3.0 (task)

•!

Page 3: OpenMP · 2011-07-05 · OpenMP ... pc ?

3

•!–!–!

•!

–!CPU–!–!–!

Page 4: OpenMP · 2011-07-05 · OpenMP ... pc ?

4

•!–! 3GHz, 10GHz

•!–! 65nm 45nm , 32nm(20?)

•!

•!–! VLIW! –! L3–! Intel Hyperthreading

•!–! CPU

® Pentium®

Page 5: OpenMP · 2011-07-05 · OpenMP ... pc ?

5

4

Page 6: OpenMP · 2011-07-05 · OpenMP ... pc ?

6

Page 7: OpenMP · 2011-07-05 · OpenMP ... pc ?

7

•!–!

•!–!

•! CPU

–!•!

•!

Page 8: OpenMP · 2011-07-05 · OpenMP ... pc ?

8

Page 9: OpenMP · 2011-07-05 · OpenMP ... pc ?

9

•! (Message Passing) –!–!–!–!–!

•! (shared memory) –!–! DSM on–!–!–!

Page 10: OpenMP · 2011-07-05 · OpenMP ... pc ?

10

for(i=0;i<1000; i++) S += A[i]

1 2 3 4 1000

+ S

1 2 1000 250 251 500 501 750 751

+ + + +

+ S

Page 11: OpenMP · 2011-07-05 · OpenMP ... pc ?

11

•!–!–!–!–! POSIX pthread

Page 12: OpenMP · 2011-07-05 · OpenMP ... pc ?

12

POSIX

•!

for(t=1;t<n_thd;t++){ r=pthread_create(thd_main,t) } thd_main(0); for(t=1; t<n_thd;t++)

pthread_join();

Pthread, Solaris thread

•!•!int s; /* global */ int n_thd; /* number of threads */ int thd_main(int id) { int c,b,e,i,ss; c=1000/n_thd; b=c*id; e=s+c; ss=0; for(i=b; i<e; i++) ss += a[i]; pthread_lock(); s += ss; pthread_unlock(); return s; }

Page 13: OpenMP · 2011-07-05 · OpenMP ... pc ?

13

OpenMP

#pragma omp parallel for reduction(+:s) for(i=0; i<1000;i++) s+= a[i];

OK!

Page 14: OpenMP · 2011-07-05 · OpenMP ... pc ?

14

OpenMP•!

–! (Fortran/C/C++) directive

•! ISV–! Oct. 1997 Fortran ver.1.0 API –! Oct. 1998 C/C++ ver.1.0 API –! OpenMP 3.0

•! URL

–! http://www.openmp.org/

Page 15: OpenMP · 2011-07-05 · OpenMP ... pc ?

15

OpenMP•!

–!

•!–!

•! OpenMP–!

•!–!–! 5% 95% (?) 5%

•!–! small-scale( 16 medium-scale ( 64–!

•! pthread OS-oriented, general-purpose

Page 16: OpenMP · 2011-07-05 · OpenMP ... pc ?

16

OpenMP API •!

–! directives/pragma

–! Fortran77, f90, C, C++ •! Fortran !$OMP•! C: #pragma omp pragma

•!–!

•!–! incremental–!–!

Page 17: OpenMP · 2011-07-05 · OpenMP ... pc ?

17

OpenMP•!•! Fork-join•! parallel region

–!

… A ...
#pragma omp parallel { foo(); /* ..B... */ }
… C ….
#pragma omp parallel { … D … }
… E ...

Call foo() Call foo() Call foo() Call foo()

A

B

C

D

E

fork

join

Page 18: OpenMP · 2011-07-05 · OpenMP ... pc ?

18

Parallel Region

•! (team)–! Parallel–! Parallel region team–! region team

•!

!$OMP PARALLEL! … … parallel region ... !$OMP END PARALLEL

#pragma omp parallel { ... ... Parallel region... ... }

Fortran: C:

Page 19: OpenMP · 2011-07-05 · OpenMP ... pc ?

19

•! /proc/cpuinfo •! gcc –fopenmp, gcc 4.2 , gfortran •!•! OMP_NUM_THREADS

#include <omp.h> #include <stdio.h> main() { printf("omp-test ... n_thread=%d\n",omp_get_max_threads()); #pragma omp parallel {

printf("thread (%d/%d)...\n", omp_get_thread_num(),omp_get_num_threads());

} printf("end...\n"); }

Page 20: OpenMP · 2011-07-05 · OpenMP ... pc ?

20

Work sharing•! Team

–! parallel region–! for

•!•!

–! sections•!•!

–! single•!

–! parallel •!parallel for •!parallel sections

directiveswork-sharing, sync

Duplicated execution

thread1 thread2 thread3

Page 21: OpenMP · 2011-07-05 · OpenMP ... pc ?

21

For•! For DO•! for canonical shape

–! var private–! incr-expr

•!++var,var++,--var,var--,var+=incr,var-=incr –! logical-op

•!–! break

–! clause

#pragma omp for [clause…] for(var=lb; var logical-op ub; incr-expr) body

Page 22: OpenMP · 2011-07-05 · OpenMP ... pc ?

22

Page 23: OpenMP · 2011-07-05 · OpenMP ... pc ?

23

Page 24: OpenMP · 2011-07-05 · OpenMP ... pc ?

24

Matvec(double a[],int row_start,int col_idx[], double x[],double y[],int n) { int i,j,start,end; double t; #pragma omp parallel for private(j,t,start,end) for(i=0; i<n;i++){ start=row_start[i]; end=row_start[i+1]; t = 0.0; for(j=start;j<end;j++) t += a[j]*x[col_idx[j]]; y[i]=t; } }

X y

a[col_idx[j]]

A

a

Page 25: OpenMP · 2011-07-05 · OpenMP ... pc ?

25

•!

schedule(static,n)

Schedule(static)

Schedule(dynamic,n)

Schedule(guided,n)

n Iteration space

Page 26: OpenMP · 2011-07-05 · OpenMP ... pc ?

26

Data scope•! parallel work sharing•! shared(var_list)

–!•! private(var_list)

–! private •! firstprivate(var_list)

–! private•! lastprivate(var_list)

–! private

•! reduction(op:var_list) –! reduction–! private

Page 27: OpenMP · 2011-07-05 · OpenMP ... pc ?

27

Data Race

Data Race =

OpenMP

Page 28: OpenMP · 2011-07-05 · OpenMP ... pc ?

28

Page 29: OpenMP · 2011-07-05 · OpenMP ... pc ?

29

Barrier •!

–!–! flush–! work sharing nowait

#pragma omp barrier

Page 30: OpenMP · 2011-07-05 · OpenMP ... pc ?

30

for implicit

Page 31: OpenMP · 2011-07-05 · OpenMP ... pc ?

31

nowait

Page 32: OpenMP · 2011-07-05 · OpenMP ... pc ?

32

•! single

•! master

•! section

•! critical

•! flush•! threadprivate

Page 33: OpenMP · 2011-07-05 · OpenMP ... pc ?

33

OpenMP MPI cpi

•!•! MPICH

•! OpenMP–! , 1

•! MPI (cpi-mpi.c) –! n Bcast –! reduction –!

Page 34: OpenMP · 2011-07-05 · OpenMP ... pc ?

34

OpenMP#include <stdio.h> #include <math.h> double f( double ); double f( double a ) { return (4.0 / (1.0 + a*a)); } int main( int argc, char *argv[]) { int n, i; double PI25DT = 3.141592653589793238462643; double pi, h, sum, x; scanf(“%d",&n); h = 1.0 / (double) n; sum = 0.0; #pragma omp parallel for private(x) reduction(+:sum) for (i = 1; i <= n; i++){

x = h * ((double)i - 0.5); sum += f(x);

} pi = h * sum; printf("pi is approximately %.16f, Error is %.16f\n",

pi, fabs(pi - PI25DT)); return 0; }

Page 35: OpenMP · 2011-07-05 · OpenMP ... pc ?

35

OpenMP laplace

•! Laplace–! 4 update–! Old new–!–!

•! OpenMP lap.c –! 3

•! OpenMP–! Parallel for

•! MPI–!

Page 36: OpenMP · 2011-07-05 · OpenMP ... pc ?

36

/* * Laplace equation with explict method */ #include <stdio.h> #include <math.h> /* square region */ #define XSIZE 1000 #define YSIZE 1000 #define PI 3.1415927 #define NITER 100 double u[XSIZE+2][YSIZE+2],uu[XSIZE+2][YSIZE+2]; double time1,time2; double second(); void initialize(); void lap_solve(); main() { initialize(); time1 = second(); lap_solve(); time2 = second(); printf("time=%g\n",time2-time1); exit(0); }

Page 37: OpenMP · 2011-07-05 · OpenMP ... pc ?

37

void lap_solve() { int x,y,k; double sum; #pragma omp parallel private(k,x,y) { for(k = 0; k < NITER; k++){

/* old <- new */ #pragma omp for

for(x = 1; x <= XSIZE; x++) for(y = 1; y <= YSIZE; y++) uu[x][y] = u[x][y]; /* update */

#pragma omp for for(x = 1; x <= XSIZE; x++) for(y = 1; y <= YSIZE; y++) u[x][y] = (uu[x-1][y] + uu[x+1][y] + uu[x][y-1] + uu[x][y+1])/4.0;

} } /* check sum */ sum = 0.0; #pragma omp parallel for private(y) reduction(+:sum) for(x = 1; x <= XSIZE; x++)

for(y = 1; y <= YSIZE; y++) sum += (uu[x][y]-u[x][y]);

printf("sum = %g\n",sum); }

Page 38: OpenMP · 2011-07-05 · OpenMP ... pc ?

38

void initialize() { int x,y; /* initalize */ for(x = 1; x <= XSIZE; x++) for(y = 1; y <= YSIZE; y++)

u[x][y] = sin((double)(x-1)/XSIZE*PI) + cos((double)(y-1)/YSIZE*PI); for(x = 0; x < (XSIZE+2); x++){

u[x][0] = 0.0; u[x][YSIZE+1] = 0.0; uu[x][0] = 0.0; uu[x][YSIZE+1] = 0.0;

} for(y = 0; y < (YSIZE+2); y++){

u[0][y] = 0.0; u[XSIZE+1][y] = 0.0; uu[0][y] = 0.0; uu[XSIZE+1][y] = 0.0;

} }

Page 39: OpenMP · 2011-07-05 · OpenMP ... pc ?

39

•!•!

–! gain

•! Web•!

Page 40: OpenMP · 2011-07-05 · OpenMP ... pc ?

40

Laplace AMD Opteron quad , 2 socket

XSIZE=YSIZE=1000 XSIZE=YSIZE=8000

Page 41: OpenMP · 2011-07-05 · OpenMP ... pc ?

41

Laplace Core i7 920 @ 2.67GHz, 2 socket

XSIZE=YSIZE=1000 XSIZE=YSIZE=8000

Page 42: OpenMP · 2011-07-05 · OpenMP ... pc ?

42

•! OpenMP

Page 43: OpenMP · 2011-07-05 · OpenMP ... pc ?

43

CC-NUMA first touch

Page 44: OpenMP · 2011-07-05 · OpenMP ... pc ?

44

First touch

2 socket Nehalem

Page 45: OpenMP · 2011-07-05 · OpenMP ... pc ?

45

Advanced topics

•! OpenMP 3.0 –!2007 approve

•! MPI/OpenMP Hybrid Programming –!SMP

Page 46: OpenMP · 2011-07-05 · OpenMP ... pc ?

46

OpenMP3.0•!

–! Parallel Task–! task–! taskwait

•!–! Flush

•!–! Collapse

•!•! private constructor, destructor

Openmp.org

Page 47: OpenMP · 2011-07-05 · OpenMP ... pc ?

47

Task

parallel

Page 48: OpenMP · 2011-07-05 · OpenMP ... pc ?

48

Stephen Olivier, Jan Prins, Evaluating OpenMP 3.0 Run Time Systems on Unbalanced Task Graphs, presented in IWOMP 2009

Page 49: OpenMP · 2011-07-05 · OpenMP ... pc ?

49

Stephen Olivier, Jan Prins, Evaluating OpenMP 3.0 Run Time Systems on Unbalanced Task Graphs, presented in IWOMP 2009

Page 50: OpenMP · 2011-07-05 · OpenMP ... pc ?

50

Stack

OMP_STACKSIZE

Page 51: OpenMP · 2011-07-05 · OpenMP ... pc ?

51

SMP•! PC-based SMP

–!

•! Middle scale Server–! ASCI Blue Mountain, O2K –! T2K Open Supercomputer

•! vector supercomputer–! Hitachi SR11000 –! SX-6, 7, 8?

SMPSMP

SMP)

Page 52: OpenMP · 2011-07-05 · OpenMP ... pc ?

52

MPI OpenMP Hybrid•! MPI SMP OpenMP

•! MPI+OpenMP –! MPI–!

•! SMP

•! OpenMP+MPI –! OpenMP–! single master critical

•! thread-Safe MPI•!

–! MPI –! OpenMP threadprivate

•! SMP–!

Page 53: OpenMP · 2011-07-05 · OpenMP ... pc ?

53

Thread-safety of MPI •! MPI_THREAD_SINGLE

–! A process has only one thread of execution.

•! MPI_THREAD_FUNNELED –! A process may be multithreaded, but only the thread that initialized MPI

can make MPI calls.

•! MPI_THREAD_SERIALIZED –! A process may be multithreaded, but only one thread at a time can

make MPI calls.

•! MPI_THREAD_MULTIPLE –! A process may be multithreaded and multiple threads can call MPI

functions simultaneously.

•! MPI_Init_thread

Page 54: OpenMP · 2011-07-05 · OpenMP ... pc ?

54

•! Hybrid–! flat-MPI” SMP–!

•!•!

•! Hybrid–!

SMP

Page 55: OpenMP · 2011-07-05 · OpenMP ... pc ?

55 55

0

20

40

60

80

100

120

140

160

180

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

16 OM

P/M

PI

4 OM

P/M

PI

flat MP

I

RotV

pzheedv

hpsi

MatE

etc

PC

HPSI

SD CG

GS

SD

CG

GS

4OMP/MPI(sec

)

RS-DFT on T2K SMP OpenMP/MPI NPB RSDFT

2009-HPC-119 pp. 163-168, 2009.

Page 56: OpenMP · 2011-07-05 · OpenMP ... pc ?

56

•!

•! OpenMP •!

•! MPI–!–! MPI

•!–!–!–!

!

Page 57: OpenMP · 2011-07-05 · OpenMP ... pc ?

57

•! OpenMP

–!

–! N wi pi(knapsack) W

–!

–! Task–!

Page 58: OpenMP · 2011-07-05 · OpenMP ... pc ?

58

#define MAX_N 100 int N; /* / int Cap; /* / int W[MAX_N]; /* / int P[MAX_N]; /* / int main() { int opt; read_data_file(“test.dat”); opt = knap_search(0,0,Cap); printf(“opt=%d\n”,opt); exit(0); }

read_data_file(file) char *file; { FILE *fp; int i; fp = fopen(file,"r"); fscanf(fp,"%d",&N); fscanf(fp,"%d",&Cap); for(i = 0; i < N; i++) fscanf(fp,"%d",&W[i]); for(i = 0; i < N; i++) fscanf(fp,"%d",&P[i]); fclose(fp); }

Page 59: OpenMP · 2011-07-05 · OpenMP ... pc ?

59

int knap_search(int i,int cp, int M) { int Opt; int l,r; if (i < N && M > 0){ if(M >= W[i]){

l = knap_seach(i+1,cp+P[i],M-W[i]); r = knap_serach(i+1,cp,M);

if(l > r) Opt = l; else Opt = r; } else Opt = knap_search(i+1,cp,M); } else Opt = cp; return(Opt); }