Heterogeneous Data-Parallel Programming - Nvidiadeveloper.download.nvidia.com/GTC/Satnam_Singh...XC6VLX760 758,784 logic cells, 864 DSP blocks, 1,440 dual ported 18Kb RAMs 32-bit integer

Heterogeneous Data-Parallel Programming

Satnam Singh, The University of Birmingham, UK

locks monitors condition variables spin locks priority inversion

XC6VLX760 758,784 logic cells, 864 DSP blocks, 1,440 dual ported 18Kb RAMs

32-bit integer Adder (32/474,240) >700MHz

332x1440

14820 sim-adds 1,037,400,000,000 additions/second

XD2000i FPGA in-socket accelerator for Intel FSB

XD2000F FPGA in-socket accelerator for AMD socket F

XD1000 FPGA co-processor module for socket 940

time

information processing gap

Volume of Information

processing with multi-core processors

processing with specialized processors and heterogeneous systems

2011

CUDA C Programmers

Java/Python/C#/F#/Perl Programmer

OpenMP

#include <stdio.h>

// Adds two float vectors element-wise, letting OpenMP split the loop
// iterations across the available threads.
//
// Returns 0 on completion. The command-line arguments are not used.
int main(int argc, char* argv[])
{
    (void) argc;
    (void) argv;

    // Signed bound so it matches the signed loop index below.
    const int n = 5000000;

    // The trailing () value-initialises the inputs to 0.0f; without it the
    // addition below would read indeterminate values.
    float *a = new float[n]();
    float *b = new float[n]();
    float *c = new float[n];

    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];

    // Release the buffers allocated above.
    delete[] a;
    delete[] b;
    delete[] c;

    return 0;
}

SSE2: ADDPS __m128 _mm_add_ps (__m128 a , __m128 b );

r0 := x0 + y0 r1 := x1 + y1 r2 := x2 + y2 r3 := x3 + y3

128-bits MMX/

universal language?

embedded high level software

FPGA

GPU

DSP

machine learning

grand unification theory polyglots

Gannet

DSLs

Microsoft Redmond Accelerator Team

Barry Bond Kerry Hammil

Lubomir Litchev <anonymous other person>

Effort vs. Reward (Productivity)

low effort low reward

high effort high reward

medium effort medium reward

CUDA C OpenCL HLSL DirectCompute

Thrust Accelerator

Accelerator

DRAM

using System;

using Microsoft.ParallelArrays;

namespace AddArraysPointwise
{
    /// <summary>
    /// Adds two five-element float arrays element-wise through a DX9 target
    /// and writes the sums to the console on a single line.
    /// </summary>
    class AddArraysPointwiseDX9
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var x = new FloatParallelArray(new[] { 1.0F, 2, 3, 4, 5 });
            var y = new FloatParallelArray(new[] { 6.0F, 7, 8, 9, 10 });

            // Element-wise sum expressed on the parallel arrays.
            var z = x + y;

            // The target is disposable (the F# samples bind it with `use`),
            // so release it deterministically once the result has been read.
            using (var dx9Target = new DX9Target())
            {
                foreach (var i in dx9Target.ToArray1D(z))
                {
                    Console.Write(i + " ");
                }
            }

            Console.WriteLine();
        }
    }
}

ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 texld r0, v0, s0 texld r1, v0, s1 add r1, r0, r1 mov oC0, r1

[1; 2; 3; 4; 5]

FloatParallelArray

CPU Address Space

C# Array

GPU Address Space

Encapsulated Data-parallel array

x

[6; 7; 8; 9; 10]

FloatParallelArray

y

100010101101011010

x+y GPU code

GPU memory

GPU code

y

[7; 9; 11; 13; 15]

C# Array

using System;
using Microsoft.ParallelArrays;

namespace AddArraysPointwiseMulticore
{
    /// <summary>
    /// Adds two five-element float arrays element-wise through an
    /// <c>X64MulticoreTarget</c> and writes the sums to the console.
    /// </summary>
    class AddArraysPointwiseMulticore
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the method header and the two operand declarations
            // were missing from this listing; they are restored from the DX9
            // version of the same sample - confirm against the original slide.
            var x = new FloatParallelArray(new[] { 1.0F, 2, 3, 4, 5 });
            var y = new FloatParallelArray(new[] { 6.0F, 7, 8, 9, 10 });

            var multicoreTarget = new X64MulticoreTarget();
            var z = x + y;

            foreach (var i in multicoreTarget.ToArray1D(z))
            {
                Console.Write(i + " ");
            }

            Console.WriteLine();
        }
    }
}

using System;
using Microsoft.ParallelArrays;

namespace AddArraysPointwiseFPGA
{
    /// <summary>
    /// Builds the element-wise sum of two five-element float arrays and hands
    /// it to an <c>FPGATarget</c>. The array returned by the target is not used.
    /// </summary>
    class AddArraysPointwiseMulticore
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the method header and the two operand declarations
            // were missing from this listing; they are restored from the DX9
            // version of the same sample - confirm against the original slide.
            var x = new FloatParallelArray(new[] { 1.0F, 2, 3, 4, 5 });
            var y = new FloatParallelArray(new[] { 6.0F, 7, 8, 9, 10 });

            var fpgaTarget = new FPGATarget();
            var z = x + y;

            // The return value is discarded, exactly as in the listing.
            fpgaTarget.ToArray1D(z);
        }
    }
}

open System

open Microsoft.ParallelArrays

/// Adds the vectors [1..5] and [6..10] element-wise through a DX9 target,
/// prints the resulting array with the %A formatter and returns exit code 0.
let main(args) =
    let lhs = new FloatParallelArray ([| 1.0f; 2.0f; 3.0f; 4.0f; 5.0f |])
    let rhs = new FloatParallelArray ([| 6.0f; 7.0f; 8.0f; 9.0f; 10.0f |])
    let sum = lhs + rhs
    // `use` disposes the target when main returns.
    use dx9Target = new DX9Target()
    dx9Target.ToArray1D(sum) |> printf "%A\n"
    0

rX *

pa

Shift (0,0)

k[0]

+

+

*

Shift (0,1)

k[1]

+

…

/// Sums kernel.[j] * Shift(a, shifts j) for j = i down to 0.
/// `shifts` maps a kernel index to the shift vector passed to Shift;
/// `i` is the highest kernel index to include.
let rec convolve (shifts : int -> int []) (kernel : float32 []) i (a : FloatParallelArray) =
    let term = kernel.[i] * ParallelArrays.Shift(a, shifts i)
    match i with
    | 0 -> term
    | _ -> term + convolve shifts kernel (i-1) a

namespace AddArrays1D
{
    using System;
    using Microsoft.ParallelArrays;

    /// <summary>
    /// Adds two four-element float arrays element-wise through a DX9 target
    /// and writes the sums to the console.
    /// </summary>
    class AddArrays1D
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the method header, the using directives and the
            // trailing WriteLine were missing from this listing; they are
            // restored from the neighbouring samples - confirm.
            FloatParallelArray a = new FloatParallelArray(new float[] { 1.0f, 2.0f, 3.0f, 4.0f });
            FloatParallelArray b = new FloatParallelArray(new float[] { 5.0f, 6.0f, 7.0f, 8.0f });
            FloatParallelArray c = a + b;

            Target gpuTarget = new DX9Target();
            float[] result = gpuTarget.ToArray1D(c);

            foreach (float f in result)
            {
                Console.Write(f + " ");
            }

            Console.WriteLine();
        }
    }
}


using System;
using Microsoft.ParallelArrays;
using FPA = Microsoft.ParallelArrays.FloatParallelArray;

namespace MultiplyAdd1D
{
    /// <summary>
    /// Passes three four-element arrays to <c>ParallelArrays.MultiplyAdd</c>,
    /// evaluates the result on a DX9 target and prints it.
    /// </summary>
    class MultiplyAdd1D
    {
        /// <summary>Program entry point.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the method header, the gpuTarget declaration, the
            // ParallelArrays using directive and the trailing WriteLine were
            // missing from this listing; they are restored from the
            // AddArrays1D sample - confirm against the original slide.
            FPA a = new FPA(new float[] { 1.0f, 2.0f, 3.0f, 4.0f });
            FPA b = new FPA(new float[] { 5.0f, 6.0f, 7.0f, 8.0f });
            FPA c = new FPA(new float[] { 9.0f, 10.0f, 11.0f, 12.0f });

            FPA d = ParallelArrays.MultiplyAdd(a, b, c);

            Target gpuTarget = new DX9Target();
            float[] result = gpuTarget.ToArray1D(d);

            foreach (float f in result)
            {
                Console.Write(f + " ");
            }

            Console.WriteLine();
        }
    }
}

ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 dcl_2d s2 texld r0, v0, s0 texld r1, v0, s1 texld r2, v0, s2 mad r2, r0, r1, r2 mov oC0, r2


{

Random random = new Random(42);

FPA a = MakeRandomArray(3, 4, random);

FPA b = MakeRandomArray(3, 4, random);

FPA c = a + b;


float[,] result = gpuTarget.ToArray2D(c);

WriteArray(result);

}


int main() { // Create a GPGPU computing resource DX9Target *tgtDX9 = CreateDX9Target() ; // Declare some sample input arrays float xvalues[5] = {1, 2, 3, 4, 5} ; float yvalues[5] = {6, 7, 8, 9, 10} ; // Create data-parallel versions of inputs FPA x = FPA(xvalues, 5) ; FPA y = FPA(yvalues, 5) ; // Specify data-parallel computation FPA z = x + y ; // Computation does not occur here... // Allocate space for the result array float* zvalues = (float*) malloc (5 * sizeof(float)) ; // Execute the data-parallel computation on the GPU tgtDX9->ToArray(z, zvalues, 5) ; // z = x + y is now evaluated // Write out the result for (int i = 0; i < 5; i++) cout << zvalues[i] << " " ; cout << endl ; }

/// <summary>
/// Evaluates the polynomial coe[0] + coe[1]*x + coe[2]*x^2 + ... at
/// <paramref name="x"/> using Horner's rule.
/// </summary>
/// <param name="coe">
/// Coefficients in ascending order of power, so coe[0] is the constant term.
/// NOTE(review): ascending order is assumed because the caller's table starts
/// with a 0.0f entry - confirm.
/// </param>
/// <param name="x">Point at which the polynomial is evaluated.</param>
/// <returns>The polynomial value; 0 when <paramref name="coe"/> is empty.</returns>
static float Horner(float[] coe, float x)
{
    float result = 0.0f;

    // Fold from the highest power down: r = r * x + c. The earlier form
    // (r = r + x * c) only produced x times the sum of the coefficients.
    for (int i = coe.Length - 1; i >= 0; i--)
    {
        result = result * x + coe[i];
    }

    return result;
}

/// <summary>
/// Data-parallel counterpart of the scalar Horner overload: evaluates
/// coe[0] + coe[1]*x + coe[2]*x^2 + ... for every element of
/// <paramref name="x"/> using Horner's rule.
/// </summary>
/// <param name="coe">
/// Coefficients in ascending order of power, so coe[0] is the constant term.
/// NOTE(review): ascending order is assumed because the caller's table starts
/// with a 0.0f entry - confirm.
/// </param>
/// <param name="x">Parallel array of evaluation points.</param>
/// <returns>A parallel array with the same shape as <paramref name="x"/>.</returns>
static FloatParallelArray Horner(float[] coe, FloatParallelArray x)
{
    FloatParallelArray result = new FloatParallelArray(0.0f, x.Shape);

    // Fold from the highest power down: r = c + r * x. The earlier form
    // (r = r + x * c) only produced x times the sum of the coefficients.
    for (int i = coe.Length - 1; i >= 0; i--)
    {
        result = coe[i] + result * x;
    }

    return result;
}

/// <summary>
/// Scalar approximation of the cumulative normal distribution used by the
/// Black-Scholes sample. The value is computed for |x| and mirrored
/// (1 - w) when <paramref name="x"/> is negative.
/// </summary>
/// <param name="x">Point at which to evaluate.</param>
/// <returns>The approximated value.</returns>
/// <remarks>
/// NOTE(review): the usual Abramowitz-Stegun 26.2.17 table has negative second
/// and fourth coefficients and evaluates the polynomial in k rather than x -
/// confirm these constants against the reference before relying on the result.
/// </remarks>
static float NormCdf(float x)
{
    float[] coefficients = { 0.0f, 0.31938153f, 0.356563782f, 1.781477937f, 1.821255978f, 1.330274429f };

    float polynomial = Horner(coefficients, x);
    float magnitude = Math.Abs(x);
    float k = (float) (1.0f/(1.0 + 0.2316419f*magnitude));

    // Density factor times the polynomial correction, evaluated in double
    // precision exactly as a single left-to-right product.
    double tail = 1.0f / Math.Sqrt(2.0f * Math.PI) * Math.Exp(-magnitude * magnitude / 2.0f) * polynomial * k;
    float w = (float)(1.0f - tail);

    return x < 0 ? 1.0f - w : w;
}

/// <summary>
/// Data-parallel counterpart of the scalar NormCdf overload. The scalar
/// if/else is replaced by <c>ParallelArrays.Select(x, w, 1 - w)</c>.
/// </summary>
/// <param name="x">Parallel array of evaluation points.</param>
/// <returns>A parallel array holding the approximated values.</returns>
/// <remarks>
/// NOTE(review): Select presumably picks w where x is non-negative and 1 - w
/// elsewhere, mirroring the scalar branch - confirm against the Accelerator
/// documentation.
/// </remarks>
static FloatParallelArray NormCdf(FloatParallelArray x)
{
    float[] coefficients = { 0.0f, 0.31938153f, 0.356563782f, 1.781477937f, 1.821255978f, 1.330274429f };

    FloatParallelArray polynomial = Horner(coefficients, x);
    FloatParallelArray magnitude = ParallelArrays.Abs(x);
    FloatParallelArray k = 1.0f / (1.0f + 0.2316419f * magnitude);

    // exp(t) is spelled Pow(e, t) with e broadcast to the input's shape.
    FloatParallelArray eulerNumber = new FloatParallelArray(2.718281828459045f, magnitude.Shape);
    float inverseRootTwoPi = 1.0f / (float)(Math.Sqrt(2.0f * Math.PI));

    FloatParallelArray w =
        1.0f - inverseRootTwoPi * ParallelArrays.Pow(eulerNumber, -magnitude * magnitude / 2.0f) * polynomial * k;

    return ParallelArrays.Select(x, w, 1.0f - w);
}

if (x < 0)

return 1.0f - w;

else

return w;

ParallelArrays.Select(x, w, 1.0f - w);

1-w

w

x

/// <summary>
/// Scalar Black-Scholes expression for one set of inputs:
/// s * NormCdf(d1) - x * exp(-r * t) * NormCdf(d2).
/// </summary>
/// <param name="s">First price input (numerator of the log ratio).</param>
/// <param name="x">Second price input (denominator of the log ratio).</param>
/// <param name="t">Time input; its square root scales <paramref name="v"/>.</param>
/// <param name="r">Rate used in the drift and discount terms.</param>
/// <param name="v">Volatility.</param>
/// <returns>The computed value as a float.</returns>
static float BlackScholes1(float s, float x, float t, float r, float v)
{
    // v * sqrt(t) appears in both d1 and d2; name it once.
    double rootT = Math.Sqrt(t);

    double drift = (r + v * v / 2) * t;
    float d1 = (float)((Math.Log(s / x) + drift) / (v * rootT));
    float d2 = (float)(d1 - v * rootT);

    return (float)(s * NormCdf(d1) - x * Math.Exp(-r * t) * NormCdf(d2));
}

/// <summary>
/// Data-parallel counterpart of the scalar BlackScholes1 overload:
/// ss * NormCdf(d1) - xs * exp(-r * ts) * NormCdf(d2), element by element.
/// </summary>
/// <param name="ss">First price inputs (numerator of the log ratio).</param>
/// <param name="xs">Second price inputs (denominator of the log ratio).</param>
/// <param name="ts">Time inputs.</param>
/// <param name="r">Rate shared by all elements.</param>
/// <param name="v">Volatility shared by all elements.</param>
/// <returns>A parallel array of computed values.</returns>
static FloatParallelArray BlackScholes1(FloatParallelArray ss, FloatParallelArray xs, FloatParallelArray ts, float r, float v)
{
    FloatParallelArray rootTs = ParallelArrays.Sqrt(ts);

    // The scalar overload uses the natural logarithm (Math.Log), so the
    // base-2 logarithm is rescaled: ln(a) = log2(a) * ln(2).
    FloatParallelArray logRatio = ParallelArrays.Log2(ss / xs) * (float)Math.Log(2.0);

    // Divide the whole numerator by v * sqrt(t), as the scalar overload does;
    // previously only the drift term was divided.
    FloatParallelArray d1 = (logRatio + (r + v * v / 2) * ts) / (v * rootTs);
    FloatParallelArray d2 = d1 - v * rootTs;

    // exp(t) is spelled Pow(e, t) with e broadcast to the input's shape.
    FloatParallelArray e = new FloatParallelArray(2.718281828459045f, ts.Shape);

    return ss * NormCdf(d1) - xs * ParallelArrays.Pow(e, -r * ts) * NormCdf(d2);
}

/// <summary>
/// Applies the scalar BlackScholes1 to each triple (ss[i], xs[i], ts[i])
/// with the fixed parameters r = 1.3 and v = 2.5.
/// </summary>
/// <param name="ss">First price inputs; its length sets the result length.</param>
/// <param name="xs">Second price inputs, indexed in step with <paramref name="ss"/>.</param>
/// <param name="ts">Time inputs, indexed in step with <paramref name="ss"/>.</param>
/// <returns>A new array with one value per element of <paramref name="ss"/>.</returns>
static float[] BlackScholes(float[] ss, float[] xs, float[] ts)
{
    float r = 1.3f;
    float v = 2.5f;

    var result = new float[ss.Length];
    for (int i = 0; i < ss.Length; i++)
    {
        // The helper is named BlackScholes1; the previous spelling
        // "BlackCholes1" matched no method.
        result[i] = BlackScholes1(ss[i], xs[i], ts[i], r, v);
    }

    return result;
}

/// <summary>
/// Data-parallel counterpart of the array BlackScholes overload: forwards
/// the three parallel arrays to BlackScholes1 with the fixed parameters
/// r = 1.3 and v = 2.5.
/// </summary>
/// <param name="ss">First price inputs.</param>
/// <param name="xs">Second price inputs.</param>
/// <param name="ts">Time inputs.</param>
/// <returns>The parallel array produced by BlackScholes1.</returns>
static FloatParallelArray BlackScholes(FloatParallelArray ss, FloatParallelArray xs, FloatParallelArray ts)
{
    float r = 1.3f;
    float v = 2.5f;

    // The helper is named BlackScholes1; the previous spelling
    // "BlackCholes1" matched no method.
    return BlackScholes1(ss, xs, ts, r, v);
}

/// <summary>
/// Sequential FIR filter: for every input sample, shifts it into a window of
/// the last <c>size</c> samples and writes the weighted sum of that window.
/// </summary>
/// <param name="weights">Filter weights; entries 0 .. size-1 are read.</param>
/// <param name="input">Samples to filter.</param>
/// <returns>A new array with one output per input sample.</returns>
/// <remarks>
/// <c>size</c> is declared outside this method.
/// NOTE(review): presumably it equals weights.Length - confirm.
/// </remarks>
public static int[] SequentialFIRFunction(int[] weights, int[] input)
{
    int[] history = new int[size];
    int[] output = new int[input.Length];

    // Clear the window of x values to all zero.
    for (int slot = 0; slot < size; slot++)
    {
        history[slot] = 0;
    }

    for (int sample = 0; sample < input.Length; sample++)
    {
        // Age every stored value by one position, oldest first, then put
        // the new sample at the front.
        for (int slot = size - 1; slot >= 1; slot--)
        {
            history[slot] = history[slot - 1];
        }
        history[0] = input[sample];

        // Weighted sum over the current window.
        int accumulator = 0;
        for (int tap = 0; tap < size; tap++)
        {
            accumulator += weights[tap] * history[tap];
        }
        output[sample] = accumulator;
    }

    return output;
}

The Accidental Semi-colon

y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]]

y[0] = a[0]x[0] + a[1]x[-1] + a[2]x[-2] + a[3]x[-3] + a[4]x[-4] y[1] = a[0]x[1] + a[1]x[0] + a[2]x[-1] + a[3]x[-2] + a[4]x[-3] y[2] = a[0]x[2] + a[1]x[1] + a[2]x[0] + a[3]x[-1] + a[4]x[-2] y[3] = a[0]x[3] + a[1]x[2] + a[2]x[1] + a[3]x[0] + a[4]x[-1] y[4] = a[0]x[4] + a[1]x[3] + a[2]x[2] + a[3]x[1] + a[4]x[0] y[5] = a[0]x[5] + a[1]x[4] + a[2]x[3] + a[3]x[2] + a[4]x[1] y[6] = a[0]x[6] + a[1]x[5] + a[2]x[4] + a[3]x[3] + a[4]x[2] y[7] = a[0]x[7] + a[1]x[6] + a[2]x[5] + a[3]x[4] + a[4]x[3]

y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]] = a[0] * [x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]] +

a[1] * [x[-1], x[0], x[1], x[2], x[3], x[4], x[5], x[6]] +

a[2] * [x[-2], x[-1], x[0], x[1], x[2], x[3], x[4], x[5]] +

a[3] * [x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3], x[4]] +

a[4] * [x[-4], x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3]]

shift (x, 0) = [7, 2, 5, 9, 3, 8, 6, 4] = x

shift (x, -1) = [7, 7, 2, 5, 9, 3, 8, 6] shift (x, -2) = [7, 7, 7, 2, 5, 9, 3, 8]

y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]] = a[0] * [x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]] +

a[1] * [x[-1], x[0], x[1], x[2], x[3], x[4], x[5], x[6]] +

a[2] * [x[-2], x[-1], x[0], x[1], x[2], x[3], x[4], x[5]] +

a[3] * [x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3], x[4]] +

a[4] * [x[-4], x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3]]

y = a[0] * shift (x, 0) + a[1] * shift (x, -1) + a[2] * shift (x, -2) + a[3] * shift (x, -3) + a[4] * shift (x, -4)


using A = Microsoft.ParallelArrays.ParallelArrays;

namespace AcceleratorSamples
{
    /// <summary>Data-parallel 1D convolution sample.</summary>
    public class Convolver
    {
        /// <summary>
        /// Computes the sum over i of a[i] * Shift(x, -i) as a parallel-array
        /// expression and evaluates it on <paramref name="computeTarget"/>.
        /// </summary>
        /// <param name="computeTarget">Target that evaluates the expression.</param>
        /// <param name="a">Filter weights.</param>
        /// <param name="x">Input samples.</param>
        /// <returns>The array returned by the target's ToArray1D.</returns>
        public static float[] Convolver1D(Target computeTarget, float[] a, float[] x)
        {
            FloatParallelArray samples = new FloatParallelArray(x);

            // Accumulator starts as an all-zero array as long as the input.
            FloatParallelArray accumulator = new FloatParallelArray(0.0f, new[] { x.Length });

            int tap = 0;
            foreach (float weight in a)
            {
                accumulator = accumulator + weight * A.Shift(samples, -tap);
                tap++;
            }

            return computeTarget.ToArray1D(accumulator);
        }
    }
}

for (int i = 0; i < a.Length; i++)

ypar += a[i] * A.Shift(xpar, -i);


using A = Microsoft.ParallelArrays.ParallelArrays;


{

/// <summary>Data-parallel 1D convolution applied to a 2D input.</summary>
public class Convolver
{
    /// <summary>
    /// Computes the sum over i of a[i] * Shift(x, {0, -i}), shifting only
    /// along the second dimension, and evaluates it on
    /// <paramref name="computeTarget"/>.
    /// </summary>
    /// <param name="computeTarget">Target that evaluates the expression.</param>
    /// <param name="a">Filter weights.</param>
    /// <param name="x">Two-dimensional input.</param>
    /// <returns>The array returned by the target's ToArray2D.</returns>
    public static float[,] Convolver1D_2DInput(Target computeTarget, float[] a, float[,] x)
    {
        FloatParallelArray source = new FloatParallelArray(x);
        int extent0 = x.GetLength(0);
        int extent1 = x.GetLength(1);

        // Accumulator starts as an all-zero array with the input's extents.
        FloatParallelArray accumulator = new FloatParallelArray(0.0f, new[] { extent0, extent1 });

        // A single offset array is reused; only its second entry changes.
        int[] offset = { 0, 0 };
        int tap = 0;
        while (tap < a.Length)
        {
            offset[1] = -tap;
            accumulator = accumulator + a[tap] * A.Shift(source, offset);
            tap++;
        }

        return computeTarget.ToArray2D(accumulator);
    }
}

}

var shiftBy = new [] {0, 0} ; for (var i = 0; i < a.Length; i++)

{

shiftBy[1] = -i;

ypar += a[i] * A.Shift(xpar, shiftBy);

}

ps_3_0

dcl_2d s0

dcl_texcoord0 v0.xy

dcl_texcoord1 v1.xy

dcl_texcoord2 v2.xy

dcl_texcoord3 v3.xy

dcl_texcoord4 v4.xy

def c0, 0.054489, 0.054489, 0.054489, 0.054489

def c1, 0.000000, 0.000000, 0.000000, 0.000000

def c2, 0.244201, 0.244201, 0.244201, 0.244201

def c3, 0.402620, 0.402620, 0.402620, 0.402620

texld r0, v0, s0

mul r0, r0, c0

add r0, c1, r0

texld r1, v1, s0

mul r1, r1, c2

add r1, r0, r1

texld r2, v2, s0

mul r2, r2, c3

add r2, r1, r2

texld r3, v3, s0

mul r3, r3, c2

add r3, r2, r3

texld r4, v4, s0

mul r4, r4, c0

add r4, r3, r4

mov oC0, r4

using System;



{

public class Convolver2D

{

/// <summary>
/// Recursively builds kernel[i] * Shift(a, shifts(i)) + ... +
/// kernel[0] * Shift(a, shifts(0)).
/// </summary>
/// <param name="shifts">Maps a kernel index to the shift vector for that tap.</param>
/// <param name="kernel">Filter weights.</param>
/// <param name="i">Highest kernel index to include; recursion stops at 0.</param>
/// <param name="a">Array being convolved.</param>
/// <returns>The summed parallel-array expression.</returns>
static FloatParallelArray convolve(Func<int, int[]> shifts, float[] kernel, int i, FloatParallelArray a)
{
    FloatParallelArray term = kernel[i] * ParallelArrays.Shift(a, shifts(i));
    return i == 0
        ? term
        : term + convolve(shifts, kernel, i - 1, a);
}

/// <summary>
/// Separable 2D convolution: applies the kernel with shifts {-i, 0} first,
/// then applies it again to that result with shifts {0, -i}.
/// </summary>
/// <param name="kernel">Filter weights used for both passes.</param>
/// <param name="input">Two-dimensional array to convolve.</param>
/// <returns>The parallel-array expression for the second pass.</returns>
static FloatParallelArray convolveXY(float[] kernel, FloatParallelArray input)
{
    int lastTap = kernel.Length - 1;

    Func<int, int[]> shiftFirstAxis = i => new [] { -i, 0 };
    Func<int, int[]> shiftSecondAxis = i => new [] { 0, -i };

    FloatParallelArray firstPass = convolve(shiftFirstAxis, kernel, lastTap, input);
    return convolve(shiftSecondAxis, kernel, lastTap, firstPass);
}


{

const int inputSize = 10;

var random = new Random(42);

var inputData = new float[inputSize, inputSize];

for (int row = 0; row < inputSize; row++)

for (int col = 0; col < inputSize; col++)

inputData[row, col] = (float)random.NextDouble() * random.Next(1, 100);

var testKernel = new float[]{2, 5, 7, 4, 3} ;


var inputArray = new FloatParallelArray(inputData);

var result = dx9Target.ToArray2D(convolveXY (testKernel, inputArray));

for (var row = 0; row < inputSize; row++)

{

for (var col = 0; col < inputSize; col++)

Console.Write("{0} ", result[row, col]);


}

}

}

}

static FloatParallelArray convolve(Func<int, int[]> shifts, float[] kernel, int i, FloatParallelArray a)

{

FloatParallelArray e = kernel[i] * ParallelArrays.Shift(a, shifts(i));

if (i == 0)

return e;

else

return e + convolve(shifts, kernel, i - 1, a);

}

static FloatParallelArray convolveXY(float[] kernel, FloatParallelArray input)

{

FloatParallelArray convolveX

= convolve(i => new [] { -i, 0 }, kernel, kernel.Length - 1, input);

return convolve(i => new [] { 0, -i }, kernel, kernel.Length - 1, convolveX);

}

using System;

using System.Linq;



{

static class Convolver2D

{

/// <summary>
/// Extension-method form of the convolution: weights each shifted copy of
/// <paramref name="a"/> by the matching kernel entry and sums the terms
/// from left to right.
/// </summary>
/// <param name="a">Array being convolved.</param>
/// <param name="shifts">Maps a kernel index to the shift vector for that tap.</param>
/// <param name="kernel">Filter weights.</param>
/// <returns>The summed parallel-array expression.</returns>
static FloatParallelArray convolve(this FloatParallelArray a,
                                   Func<int, int[]> shifts, float[] kernel)
{
    Func<float, int, FloatParallelArray> weightedShift =
        (weight, index) => weight * ParallelArrays.Shift(a, shifts(index));
    Func<FloatParallelArray, FloatParallelArray, FloatParallelArray> add =
        (left, right) => left + right;

    var terms = kernel.Select(weightedShift);
    return terms.Aggregate(add);
}

/// <summary>
/// Separable 2D convolution in extension-method form: convolves with shifts
/// {-i, 0}, then convolves that result with shifts {0, -i}.
/// </summary>
/// <param name="input">Two-dimensional array to convolve.</param>
/// <param name="kernel">Filter weights used for both passes.</param>
/// <returns>The parallel-array expression for the second pass.</returns>
static FloatParallelArray convolveXY(this FloatParallelArray input, float[] kernel)
{
    FloatParallelArray firstPass = input.convolve(i => new[] { -i, 0 }, kernel);
    FloatParallelArray secondPass = firstPass.convolve(i => new[] { 0, -i }, kernel);
    return secondPass;
}


{

const int inputSize = 10;

var random = new Random(42);

var inputData = new float[inputSize, inputSize];

for (int row = 0; row < inputSize; row++)


inputData[row, col] = (float)random.NextDouble() * random.Next(1, 100);

var testKernel = new[] { 2F, 5, 7, 4, 3 };


var inputArray = new FloatParallelArray(inputData);

var result = dx9Target.ToArray2D(inputArray.convolveXY(testKernel));

for (var row = 0; row < inputSize; row++)

{


Console.Write("{0} ", result[row, col]);


}

}

}

}

static FloatParallelArray convolve(this FloatParallelArray a,

Func<int, int[]> shifts, float[] kernel)

{ return kernel

.Select((k, i) => k * ParallelArrays.Shift(a, shifts(i)))

.Aggregate((a1, a2) => a1 + a2);

}

static FloatParallelArray convolveXY(this FloatParallelArray input, float[] kernel)

{ return input

.convolve(i => new[] { -i, 0 }, kernel)

.convolve(i => new[] { 0, -i }, kernel);

}

// Separable 2D convolution with a filter centred on each element.
//
// Pass 1 accumulates filter[t] * Shift(input, {t - filterHalf, 0}) into smoothX.
// Pass 2 accumulates filter[t] * Shift(smoothX, {0, t - filterHalf}) into result.
// The second-pass result is written to resultArray through tgt.ToArray.
//
// Returns the first-pass array (smoothX), not the final result.
// NOTE(review): returning the intermediate looks unusual - confirm that
// callers expect it.
FPA ConvolveXY(Target &tgt, int height, int width, int filterSize, float filter[],
               FPA input, float *resultArray)
{
    const int filterHalf = filterSize / 2;
    const int tapCount = 2 * filterHalf + 1;

    size_t dims[] = {height,width};
    intptr_t counts[] = {0,0};

    // Convolve in X (row) direction.
    FPA smoothX = FPA(0,dims, 2);
    for (int tap = 0; tap < tapCount; ++tap)
    {
        counts[0] = tap - filterHalf;
        smoothX += Shift(input, counts, 2) * filter[tap];
    }

    // Convolve in Y (col) direction; the first offset goes back to zero.
    counts[0] = 0;
    FPA result = FPA(0,dims, 2);
    for (int tap = 0; tap < tapCount; ++tap)
    {
        counts[1] = tap - filterHalf;
        result += Shift(smoothX, counts, 2) * filter[tap];
    }

    tgt.ToArray(result, resultArray, height, width, width * sizeof(float));
    return smoothX;
}

open System

open Microsoft.ParallelArrays

[<EntryPoint>]
let main(args) =
    // Filter kernel applied in both passes of the convolution
    let testKernel = [| 2.0f; 5.0f; 7.0f; 4.0f; 3.0f |]

    // Side length of the square input array
    let inputSize = 10

    // Pseudo-random number generator with a fixed seed
    let random = new Random(42)

    // Pseudo-random input data
    let testData =
        Array2D.init inputSize inputSize
            (fun i j -> float32 (random.NextDouble() * float (random.Next(1, 100))))

    // Accelerator float parallel array wrapping the F# input array
    use testArray = new FloatParallelArray(testData)

    // Sums kernel.[j] * Shift(a, shifts j) for j = i down to 0
    let rec convolve (shifts : int -> int []) (kernel : float32 []) i (a : FloatParallelArray) =
        let term = kernel.[i] * ParallelArrays.Shift(a, shifts i)
        match i with
        | 0 -> term
        | _ -> term + convolve shifts kernel (i-1) a

    // 2D convolver: one pass with shifts [| -i; 0 |], then one with [| 0; -i |]
    let convolveXY (kernel : float32 []) input =
        let lastTap = kernel.Length - 1
        input
        |> convolve (fun i -> [| -i; 0 |]) kernel lastTap
        |> convolve (fun i -> [| 0; -i |]) kernel lastTap

    // Create a DX9 target and use it to convolve the test input
    use dx9Target = new DX9Target()
    let convolveDX9 = dx9Target.ToArray2D (convolveXY testKernel testArray)
    printfn "DX9: -> \r\n%A" convolveDX9
    0

let convolveXY kernel input

= // First convolve in the X direction and then in Y

let convolveX = convolve (fun i -> [| -i; 0 |]) kernel (kernel.Length - 1) input

let convolveY = convolve (fun i -> [| 0; -i |]) kernel (kernel.Length - 1) convolveX

convolveY

0

5

10

15

20

25

30

35

40

0 20 40 60 80 100 120

speedup over sequential version

kernel size

Speedup using Accelerator GPU and SSE3 multicore targets for a 8000x8000 convolver

ATI Radeon HD5870 (DX9)

NVidia 470 GTX (DX9)

SSE3 Intel Xeon E7450 12 cores

Convolver

8.249ns max delay 3 x DSP48Es 63 slice registers 24 slice LUTs

Accelerator

Register for the Next GTC Express

GPU-enabled Macromolecular Simulation: Challenges and

Opportunities

Michela Taufer, Assistant Professor, Department of Computer and

Information Sciences, University of Delaware

Thursday, December 1, 2011, 9:00 AM PDT

Register at www.gputechconf.com

Documents

Heterogeneous Data-Parallel Programming - Nvidiadeveloper.download.nvidia.com/GTC/Satnam_Singh...XC6VLX760 758,784 logic cells, 864 DSP blocks, 1,440 dual ported 18Kb RAMs 32-bit integer