Upload
others
View
12
Download
0
Embed Size (px)
Citation preview
Heterogeneous Data-Parallel Programming
Satnam Singh, The University of Birmingham, UK
locks monitors condition variables spin locks priority inversion
XC6VLX760 758,784 logic cells, 864 DSP blocks, 1,440 dual ported 18Kb RAMs
32-bit integer Adder (32/474,240) >700MHz
332x1440
14820 sim-adds 1,037,400,000,000 additions/second
XD2000i FPGA in-socket accelerator for Intel FSB
XD2000F FPGA in-socket accelerator for AMD socket F
XD1000 FPGA co-processor module for socket 940
time
information processing gap
Volume Of Information
processing with multi-core processors
processing with specialized processors and heterogeneous systems
2011
CUDAC Programmers
Java/Python/C#/F#/Perl Programmer
OpenMP
#include <stdio.h>
// Adds two float vectors element-wise, letting OpenMP split the loop
// iterations across threads.
//
// argc/argv are accepted but not used. Always returns 0.
int main(int argc, char* argv[])
{
    // Signed, so that the loop index and the bound have the same signedness.
    const int n = 5000000;

    // Value-initialise the inputs (the trailing "()" zero-fills them); the
    // original summed indeterminate values, which is undefined behaviour.
    float *a = new float[n]();
    float *b = new float[n]();
    float *c = new float[n];

    // Each iteration touches only its own index, so iterations are independent.
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];

    // Release the buffers that the original leaked.
    delete[] a;
    delete[] b;
    delete[] c;
    return 0;
}
SSE2: ADDPS __m128 _mm_add_ps (__m128 a , __m128 b );
r0 := x0 + y0 r1 := x1 + y1 r2 := x2 + y2 r3 := x3 + y3
128-bits MMX/
universal language?
embedded high level software
FPGA
GPU
DSP
machine learning
grand unification theory polyglots
Gannet
DSLs
Microsoft Redmond Accelerator Team
Barry Bond Kerry Hammil
Lubomir Litchev <anonymous other person>
Effort vs. Reward (Productivity)
low effort low reward
high effort high reward
medium effort medium reward
CUDAC OpenCL HLSL DirectCompute
Thrust Accelerator
Accelerator
DRAM
using System;
using Microsoft.ParallelArrays;
namespace AddArraysPointwise
{
    /// <summary>
    /// Adds two five-element float arrays element-wise through a
    /// <c>DX9Target</c> and prints the sums on a single line.
    /// </summary>
    class AddArraysPointwiseDX9
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            FloatParallelArray left = new FloatParallelArray(new float[] { 1.0F, 2.0F, 3.0F, 4.0F, 5.0F });
            FloatParallelArray right = new FloatParallelArray(new float[] { 6.0F, 7.0F, 8.0F, 9.0F, 10.0F });
            DX9Target target = new DX9Target();

            // Build the sum expression first, then ask the target for a concrete array.
            var sum = left + right;
            var values = target.ToArray1D(sum);

            foreach (var value in values)
            {
                Console.Write(value + " ");
            }
            Console.WriteLine();
        }
    }
}
ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 texld r0, v0, s0 texld r1, v0, s1 add r1, r0, r1 mov oC0, r1
[1; 2; 3; 4; 5]
FloatParallelArray
CPU Address Space
C# Array
GPU Address Space
Encapsulated Data-parallel array
x
[6; 7; 8; 9; 10]
FloatParallelArray
y
100010101101011010
x+y GPU code
GPU memory
GPU code
y
[7; 9; 11; 13; 15]
C# Array
using System;
using Microsoft.ParallelArrays;
namespace AddArraysPointwiseMulticore
{
    /// <summary>
    /// Adds two five-element float arrays element-wise through an
    /// <c>X64MulticoreTarget</c> and prints the sums on a single line.
    /// </summary>
    class AddArraysPointwiseMulticore
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            float[] firstValues = { 1.0F, 2.0F, 3.0F, 4.0F, 5.0F };
            float[] secondValues = { 6.0F, 7.0F, 8.0F, 9.0F, 10.0F };

            var first = new FloatParallelArray(firstValues);
            var second = new FloatParallelArray(secondValues);
            var target = new X64MulticoreTarget();
            var total = first + second;

            // Same printing format as the other targets: each value followed by a space.
            foreach (var element in target.ToArray1D(total))
            {
                Console.Write(element + " ");
            }
            Console.WriteLine();
        }
    }
}
using System;
using Microsoft.ParallelArrays;
namespace AddArraysPointwiseFPGA
{
    /// <summary>
    /// Builds the element-wise sum of two five-element float arrays and hands
    /// it to an <c>FPGATarget</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the class is named "Multicore" although it uses
    /// <c>FPGATarget</c>; this looks like a copy-paste leftover. The name is
    /// kept so that nothing referring to it breaks.
    /// </remarks>
    class AddArraysPointwiseMulticore
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            FloatParallelArray x = new FloatParallelArray(new float[] { 1.0F, 2.0F, 3.0F, 4.0F, 5.0F });
            FloatParallelArray y = new FloatParallelArray(new float[] { 6.0F, 7.0F, 8.0F, 9.0F, 10.0F });
            FPGATarget target = new FPGATarget();

            // The array returned by ToArray1D is deliberately not used here,
            // exactly as in the original listing.
            var sum = x + y;
            target.ToArray1D(sum);
        }
    }
}
open System
open Microsoft.ParallelArrays
/// Adds two five-element float32 arrays element-wise on a DX9Target and
/// prints the resulting array. Returns 0.
let main(args) =
    let left = new FloatParallelArray ([| 1.0f; 2.0f; 3.0f; 4.0f; 5.0f |])
    let right = new FloatParallelArray ([| 6.0f; 7.0f; 8.0f; 9.0f; 10.0f |])
    let sum = left + right
    // 'use' disposes the target when main returns.
    use dx9Target = new DX9Target()
    dx9Target.ToArray1D(sum) |> printf "%A\n"
    0
rX *
pa
Shift (0,0)
k[0]
+
+
*
Shift (0,1)
k[1]
+
…
/// Sums kernel.[j] * Shift(a, shifts j) for j = i down to 0.
/// `shifts` maps a tap index to the shift vector used for that tap.
let rec convolve (shifts : int -> int []) (kernel : float32 []) i (a : FloatParallelArray) =
    let term = kernel.[i] * ParallelArrays.Shift(a, shifts i)
    match i with
    | 0 -> term
    | _ -> term + convolve shifts kernel (i-1) a
namespace AddArrays1D
{
    /// <summary>
    /// Adds two four-element float arrays element-wise through a
    /// <c>DX9Target</c> and prints the sums on a single line.
    /// </summary>
    class AddArrays1D
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            var first = new FloatParallelArray(new[] { 1.0f, 2.0f, 3.0f, 4.0f });
            var second = new FloatParallelArray(new[] { 5.0f, 6.0f, 7.0f, 8.0f });
            FloatParallelArray sum = first + second;

            Target gpuTarget = new DX9Target();
            float[] values = gpuTarget.ToArray1D(sum);

            // Each value is followed by a single space, then one newline at the end.
            for (int index = 0; index < values.Length; index++)
            {
                Console.Write(values[index] + " ");
            }
            Console.WriteLine();
        }
    }
}
ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 texld r0, v0, s0 texld r1, v0, s1 add r1, r0, r1 mov oC0, r1
using System;
using Microsoft.ParallelArrays;
using FPA = Microsoft.ParallelArrays.FloatParallelArray;
namespace MultiplyAdd1D
{
    /// <summary>
    /// Passes three four-element arrays to <c>ParallelArrays.MultiplyAdd</c>,
    /// evaluates the result through a <c>DX9Target</c> and prints it.
    /// </summary>
    class MultiplyAdd1D
    {
        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            var first = new FPA(new[] { 1.0f, 2.0f, 3.0f, 4.0f });
            var second = new FPA(new[] { 5.0f, 6.0f, 7.0f, 8.0f });
            var third = new FPA(new[] { 9.0f, 10.0f, 11.0f, 12.0f });
            FPA combined = ParallelArrays.MultiplyAdd(first, second, third);

            Target gpuTarget = new DX9Target();
            float[] values = gpuTarget.ToArray1D(combined);

            // Each value is followed by a single space, then one newline at the end.
            for (int index = 0; index < values.Length; index++)
            {
                Console.Write(values[index] + " ");
            }
            Console.WriteLine();
        }
    }
}
ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 dcl_2d s2 texld r0, v0, s0 texld r1, v0, s1 texld r2, v0, s2 mad r2, r0, r1, r2 mov oC0, r2
/// <summary>
/// Adds two 3-by-4 arrays produced by <c>MakeRandomArray</c> from one seeded
/// generator, evaluates the sum through a <c>DX9Target</c> and writes it out
/// with <c>WriteArray</c>.
/// </summary>
static void Main(string[] args)
{
    // A fixed seed, and both arrays drawn from the same generator in this order.
    var generator = new Random(42);
    FPA first = MakeRandomArray(3, 4, generator);
    FPA second = MakeRandomArray(3, 4, generator);
    FPA sum = first + second;

    Target gpuTarget = new DX9Target();
    float[,] values = gpuTarget.ToArray2D(sum);
    WriteArray(values);
}
ps_3_0 dcl_2d s0 dcl_texcoord0 v0.xy dcl_2d s1 texld r0, v0, s0 texld r1, v0, s1 add r1, r0, r1 mov oC0, r1
int main() { // Create a GPGPU computing resource DX9Target *tgtDX9 = CreateDX9Target() ; // Declare some sample input arrays float xvalues[5] = {1, 2, 3, 4, 5} ; float yvalues[5] = {6, 7, 8, 9, 10} ; // Create data-parallel versions of inputs FPA x = FPA(xvalues, 5) ; FPA y = FPA(yvalues, 5) ; // Specify data-parallel computation FPA z = x + y ; // Computation does not occur here... // Allocate space for the result array float* zvalues = (float*) malloc (5 * sizeof(float)) ; // Execute the data-parallel computation on the GPU tgtDX9->ToArray(z, zvalues, 5) ; // z = x + y is now evaluated // Write out the result for (int i = 0; i < 5; i++) cout << zvalues[i] << " " ; cout << endl ; }
/// <summary>
/// Evaluates the polynomial coe[0] + coe[1]*x + coe[2]*x^2 + ... using
/// Horner's rule.
/// </summary>
/// <param name="coe">Coefficients in ascending order of power; coe[i] multiplies x^i.</param>
/// <param name="x">Point at which the polynomial is evaluated.</param>
/// <returns>The polynomial value, or 0 when <paramref name="coe"/> is empty.</returns>
static float Horner(float[] coe, float x)
{
    float result = 0.0f;
    // Walk from the highest power down so each step is one multiply and one add.
    // (The original accumulated result + x * c, which is just x * sum(coe).)
    for (int i = coe.Length - 1; i >= 0; i--)
    {
        result = result * x + coe[i];
    }
    return result;
}
/// <summary>
/// Element-wise counterpart of the scalar <c>Horner</c>: evaluates
/// coe[0] + coe[1]*x + coe[2]*x^2 + ... for every element of <paramref name="x"/>.
/// </summary>
/// <param name="coe">Coefficients in ascending order of power; coe[i] multiplies x^i.</param>
/// <param name="x">Array of points at which the polynomial is evaluated.</param>
/// <returns>An array with the shape of <paramref name="x"/>.</returns>
static FloatParallelArray Horner(float[] coe, FloatParallelArray x)
{
    FloatParallelArray result = new FloatParallelArray(0.0f, x.Shape);
    for (int i = coe.Length - 1; i >= 0; i--)
    {
        result = coe[i] + result * x;
    }
    return result;
}
/// <summary>
/// Approximates the standard normal cumulative distribution function with the
/// five-term polynomial of Abramowitz and Stegun, formula 26.2.17:
/// N(x) = 1 - phi(x) * (b1*k + b2*k^2 + ... + b5*k^5), k = 1 / (1 + 0.2316419*x)
/// for x >= 0, and N(x) = 1 - N(-x) for x &lt; 0.
/// </summary>
/// <param name="x">Point at which the distribution function is evaluated.</param>
/// <returns>The approximate probability that a standard normal variable is at most <paramref name="x"/>.</returns>
static float NormCdf(float x)
{
    // The leading 0 is the constant term; b2 and b4 are negative in the reference formula.
    var coe = new[] { 0.0f, 0.31938153f, -0.356563782f, 1.781477937f, -1.821255978f, 1.330274429f };
    float l = Math.Abs(x);
    float k = 1.0f / (1.0f + 0.2316419f * l);
    // The polynomial is in k, not in x. Its zero constant term already supplies
    // the overall factor of k, so no extra multiplication by k is needed.
    float poly = Horner(coe, k);
    float w = (float)(1.0 - Math.Exp(-l * l / 2.0) / Math.Sqrt(2.0 * Math.PI) * poly);
    if (x < 0)
        return 1.0f - w;
    else
        return w;
}
/// <summary>
/// Element-wise counterpart of the scalar <c>NormCdf</c>, using the same
/// Abramowitz and Stegun 26.2.17 approximation for every element.
/// </summary>
/// <param name="x">Array of points at which the distribution function is evaluated.</param>
/// <returns>An array built from the same expressions as the scalar overload.</returns>
static FloatParallelArray NormCdf(FloatParallelArray x)
{
    var coe = new[] { 0.0f, 0.31938153f, -0.356563782f, 1.781477937f, -1.821255978f, 1.330274429f };
    FloatParallelArray l = ParallelArrays.Abs(x);
    FloatParallelArray k = 1.0f / (1.0f + 0.2316419f * l);
    FloatParallelArray poly = Horner(coe, k);
    // exp(t) is expressed as Pow(e, t) with a constant array holding e.
    FloatParallelArray e = new FloatParallelArray(2.718281828459045f, l.Shape);
    FloatParallelArray w = 1.0f - 1.0f / (float)(Math.Sqrt(2.0f * Math.PI)) * ParallelArrays.Pow(e, -l * l / 2.0f) * poly;
    // Stands in for the scalar "if (x < 0) return 1 - w; else return w;".
    // NOTE(review): relies on Select picking its second argument where x is non-negative - confirm.
    return ParallelArrays.Select(x, w, 1.0f - w);
}
if (x < 0)
return 1.0f - w;
else
return w;
ParallelArrays.Select(x, w, 1.0f - w);
1-w
w
x
/// <summary>
/// Computes s * NormCdf(d1) - x * exp(-r * t) * NormCdf(d2), where
/// d1 = (ln(s / x) + (r + v^2 / 2) * t) / (v * sqrt(t)) and d2 = d1 - v * sqrt(t).
/// </summary>
/// <param name="s">First price argument (numerator of the log ratio).</param>
/// <param name="x">Second price argument (denominator of the log ratio).</param>
/// <param name="t">Time argument.</param>
/// <param name="r">Rate argument.</param>
/// <param name="v">Volatility argument.</param>
/// <returns>The value of the expression above, narrowed to float.</returns>
static float BlackScholes1(float s, float x, float t, float r, float v)
{
    double rootT = Math.Sqrt(t);
    double discount = Math.Exp(-r * t);

    float d1 = (float)((Math.Log(s / x) + (r + v * v / 2) * t) / (v * rootT));
    float d2 = (float)(d1 - v * rootT);

    return (float)(s * NormCdf(d1) - x * discount * NormCdf(d2));
}
/// <summary>
/// Element-wise counterpart of the scalar <c>BlackScholes1</c>: computes
/// ss * NormCdf(d1) - xs * exp(-r * ts) * NormCdf(d2), where
/// d1 = (ln(ss / xs) + (r + v^2 / 2) * ts) / (v * sqrt(ts)) and d2 = d1 - v * sqrt(ts).
/// </summary>
/// <param name="ss">Array of first price arguments.</param>
/// <param name="xs">Array of second price arguments.</param>
/// <param name="ts">Array of time arguments.</param>
/// <param name="r">Rate argument, shared by all elements.</param>
/// <param name="v">Volatility argument, shared by all elements.</param>
/// <returns>An array built from the expression above.</returns>
static FloatParallelArray BlackScholes1(FloatParallelArray ss, FloatParallelArray xs, FloatParallelArray ts, float r, float v)
{
    FloatParallelArray sqrtTs = ParallelArrays.Sqrt(ts);
    // The scalar version uses the natural log. Log2 is base 2, so convert with
    // ln(z) = log2(z) * ln(2).
    FloatParallelArray lnRatio = ParallelArrays.Log2(ss / xs) * (float)Math.Log(2.0);
    // The whole numerator is divided by v * sqrt(t); the original divided only
    // the drift term, which disagreed with the scalar version.
    FloatParallelArray d1 = (lnRatio + (r + v * v / 2) * ts) / (v * sqrtTs);
    FloatParallelArray d2 = d1 - v * sqrtTs;
    // exp(-r * t) is expressed as Pow(e, -r * t) with a constant array holding e.
    FloatParallelArray e = new FloatParallelArray(2.718281828459045f, ts.Shape);
    return (ss * NormCdf(d1) - xs * ParallelArrays.Pow(e, -r * ts) * NormCdf(d2));
}
/// <summary>
/// Applies the scalar <c>BlackScholes1</c> to each triple (ss[i], xs[i], ts[i])
/// with the fixed parameters r = 1.3 and v = 2.5.
/// </summary>
/// <param name="ss">First price argument per element; its length determines the result length.</param>
/// <param name="xs">Second price argument per element; must be at least as long as <paramref name="ss"/>.</param>
/// <param name="ts">Time argument per element; must be at least as long as <paramref name="ss"/>.</param>
/// <returns>A new array with one result per element of <paramref name="ss"/>.</returns>
static float[] BlackScholes(float[] ss, float[] xs, float[] ts)
{
    float r = 1.3f;
    float v = 2.5f;
    var result = new float[ss.Length];
    for (int i = 0; i < result.Length; i++)
    {
        // The original called "BlackCholes1", a misspelling that does not resolve to any method.
        result[i] = BlackScholes1(ss[i], xs[i], ts[i], r, v);
    }
    return result;
}
/// <summary>
/// Applies the element-wise <c>BlackScholes1</c> to the three arrays with the
/// fixed parameters r = 1.3 and v = 2.5.
/// </summary>
/// <param name="ss">Array of first price arguments.</param>
/// <param name="xs">Array of second price arguments.</param>
/// <param name="ts">Array of time arguments.</param>
/// <returns>The array returned by <c>BlackScholes1</c>.</returns>
static FloatParallelArray BlackScholes(FloatParallelArray ss, FloatParallelArray xs, FloatParallelArray ts)
{
    float r = 1.3f;
    float v = 2.5f;
    // The original called "BlackCholes1", a misspelling that does not resolve to any method.
    return BlackScholes1(ss, xs, ts, r, v);
}
/// <summary>
/// Sequential FIR filter: for each input sample, shifts it into a window of
/// the most recent <c>size</c> samples and stores the weighted sum of that window.
/// </summary>
/// <param name="weights">Filter weights; the first <c>size</c> entries are read.</param>
/// <param name="input">Samples to filter.</param>
/// <returns>A new array with one output per input sample.</returns>
public static int[] SequentialFIRFunction(int[] weights, int[] input)
{
    // A newly allocated int[] is all zeros, so the window starts out cleared.
    int[] recent = new int[size];
    int[] output = new int[input.Length];

    int sampleIndex = 0;
    while (sampleIndex < input.Length)
    {
        // Age every stored sample by one slot, then put the new sample at the front.
        int slot = size - 1;
        while (slot > 0)
        {
            recent[slot] = recent[slot - 1];
            slot--;
        }
        recent[0] = input[sampleIndex];

        // Weighted sum over the whole window.
        int total = 0;
        for (int tap = 0; tap < size; tap++)
        {
            total += weights[tap] * recent[tap];
        }
        output[sampleIndex] = total;
        sampleIndex++;
    }
    return output;
}
The Accidental Semi-colon
y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]]
y[0] = a[0]x[0] + a[1]x[-1] + a[2]x[-2] + a[3]x[-3] + a[4]x[-4] y[1] = a[0]x[1] + a[1]x[0] + a[2]x[-1] + a[3]x[-2] + a[4]x[-3] y[2] = a[0]x[2] + a[1]x[1] + a[2]x[0] + a[3]x[-1] + a[4]x[-2] y[3] = a[0]x[3] + a[1]x[2] + a[2]x[1] + a[3]x[0] + a[4]x[-1] y[4] = a[0]x[4] + a[1]x[3] + a[2]x[2] + a[3]x[1] + a[4]x[0] y[5] = a[0]x[5] + a[1]x[4] + a[2]x[3] + a[3]x[2] + a[4]x[1] y[6] = a[0]x[6] + a[1]x[5] + a[2]x[4] + a[3]x[3] + a[4]x[2] y[7] = a[0]x[7] + a[1]x[6] + a[2]x[5] + a[3]x[4] + a[4]x[3]
y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]] = a[0] * [x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]] +
a[1] * [x[-1], x[0], x[1], x[2], x[3], x[4], x[5], x[6]] +
a[2] * [x[-2], x[-1], x[0], x[1], x[2], x[3], x[4], x[5]] +
a[3] * [x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3], x[4]] +
a[4] * [x[-4], x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3]]
shift (x, 0) = [7, 2, 5, 9, 3, 8, 6, 4] = x
shift (x, -1) = [7, 7, 2, 5, 9, 3, 8, 6] shift (x, -2) = [7, 7, 7, 2, 5, 9, 3, 8]
y = [y[0], y[1], y[2], y[3], y[4], y[5], y[6], y[7]] = a[0] * [x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]] +
a[1] * [x[-1], x[0], x[1], x[2], x[3], x[4], x[5], x[6]] +
a[2] * [x[-2], x[-1], x[0], x[1], x[2], x[3], x[4], x[5]] +
a[3] * [x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3], x[4]] +
a[4] * [x[-4], x[-3], x[-2], x[-1], x[0], x[1], x[2], x[3]]
y = a[0] * shift (x, 0) + a[1] * shift (x, -1) + a[2] * shift (x, -2) + a[3] * shift (x, -3) + a[4] * shift (x, -4)
using Microsoft.ParallelArrays;
using A = Microsoft.ParallelArrays.ParallelArrays;
namespace AcceleratorSamples
{
    /// <summary>One-dimensional convolution written as a sum of shifted, scaled copies of the input.</summary>
    public class Convolver
    {
        /// <summary>
        /// Builds the sum over i of a[i] * Shift(x, -i) and evaluates it on the given target.
        /// </summary>
        /// <param name="computeTarget">Target used to evaluate the expression.</param>
        /// <param name="a">Filter weights.</param>
        /// <param name="x">Input samples.</param>
        /// <returns>The array produced by <c>computeTarget.ToArray1D</c>.</returns>
        public static float[] Convolver1D(Target computeTarget, float[] a, float[] x)
        {
            FloatParallelArray input = new FloatParallelArray(x);
            int length = x.Length;
            FloatParallelArray sum = new FloatParallelArray(0.0f, new[] { length });

            // One shifted, weighted copy of the input per filter weight.
            int tap = 0;
            while (tap < a.Length)
            {
                sum = sum + a[tap] * A.Shift(input, -tap);
                tap++;
            }

            return computeTarget.ToArray1D(sum);
        }
    }
}
for (int i = 0; i < a.Length; i++)
ypar += a[i] * A.Shift(xpar, -i);
using Microsoft.ParallelArrays;
using A = Microsoft.ParallelArrays.ParallelArrays;
namespace AcceleratorSamples
{
    /// <summary>One-dimensional convolution applied to a two-dimensional input.</summary>
    public class Convolver
    {
        /// <summary>
        /// Builds the sum over i of a[i] * Shift(x, {0, -i}), i.e. shifting only
        /// along the second dimension, and evaluates it on the given target.
        /// </summary>
        /// <param name="computeTarget">Target used to evaluate the expression.</param>
        /// <param name="a">Filter weights.</param>
        /// <param name="x">Two-dimensional input.</param>
        /// <returns>The array produced by <c>computeTarget.ToArray2D</c>.</returns>
        public static float[,] Convolver1D_2DInput
            (Target computeTarget, float[] a, float[,] x)
        {
            FloatParallelArray input = new FloatParallelArray(x);
            int rows = x.GetLength(0);
            int columns = x.GetLength(1);
            FloatParallelArray sum = new FloatParallelArray(0.0f, new[] { rows, columns });

            // A single shift vector is reused; only its second entry changes per tap.
            int[] offset = { 0, 0 };
            int tap = 0;
            while (tap < a.Length)
            {
                offset[1] = -tap;
                sum = sum + a[tap] * A.Shift(input, offset);
                tap++;
            }

            return computeTarget.ToArray2D(sum);
        }
    }
}
var shiftBy = new [] {0, 0} ; for (var i = 0; i < a.Length; i++)
{
shiftBy[1] = -i;
ypar += a[i] * A.Shift(xpar, shiftBy);
}
// Pixel shader, shader model 3.0.
// Computes r4 = c1 + c0*t(v0) + c2*t(v1) + c3*t(v2) + c2*t(v3) + c0*t(v4),
// where t(vN) is sampler s0 read at texture coordinate vN, and writes r4 to oC0.
// NOTE(review): presumably v0..v4 are the shifted coordinates of a 5-tap
// convolution and this listing is generated output - confirm against the generator.
ps_3_0
// One 2D sampler and five texture-coordinate inputs.
dcl_2d s0
dcl_texcoord0 v0.xy
dcl_texcoord1 v1.xy
dcl_texcoord2 v2.xy
dcl_texcoord3 v3.xy
dcl_texcoord4 v4.xy
// Constants, each replicated across all four components.
// c1 is the zero starting value of the sum; c0, c2 and c3 are the weights,
// used symmetrically below as c0, c2, c3, c2, c0 (they add up to 1.0).
def c0, 0.054489, 0.054489, 0.054489, 0.054489
def c1, 0.000000, 0.000000, 0.000000, 0.000000
def c2, 0.244201, 0.244201, 0.244201, 0.244201
def c3, 0.402620, 0.402620, 0.402620, 0.402620
// Tap 0: sample, scale by c0, add to the zero start value.
texld r0, v0, s0
mul r0, r0, c0
add r0, c1, r0
// Tap 1: sample, scale by c2, add to the running sum.
texld r1, v1, s0
mul r1, r1, c2
add r1, r0, r1
// Tap 2: sample, scale by c3, add to the running sum.
texld r2, v2, s0
mul r2, r2, c3
add r2, r1, r2
// Tap 3: sample, scale by c2, add to the running sum.
texld r3, v3, s0
mul r3, r3, c2
add r3, r2, r3
// Tap 4: sample, scale by c0, add to the running sum.
texld r4, v4, s0
mul r4, r4, c0
add r4, r3, r4
// Write the accumulated value to colour output 0.
mov oC0, r4
using System;
using Microsoft.ParallelArrays;
namespace AcceleratorSamples
{
    /// <summary>
    /// Two-pass convolution of a 10-by-10 pseudo-random array, evaluated
    /// through a <c>DX9Target</c> and printed row by row.
    /// </summary>
    public class Convolver2D
    {
        /// <summary>
        /// Recursively builds kernel[i] * Shift(a, shifts(i)) + ... + kernel[0] * Shift(a, shifts(0)).
        /// </summary>
        /// <param name="shifts">Maps a tap index to the shift vector for that tap.</param>
        /// <param name="kernel">Filter weights.</param>
        /// <param name="i">Index of the highest tap still to be added.</param>
        /// <param name="a">Array being convolved.</param>
        static FloatParallelArray convolve(Func<int, int[]> shifts, float[] kernel, int i, FloatParallelArray a)
        {
            FloatParallelArray term = kernel[i] * ParallelArrays.Shift(a, shifts(i));
            return i == 0
                ? term
                : term + convolve(shifts, kernel, i - 1, a);
        }

        /// <summary>
        /// Convolves <paramref name="input"/> with <paramref name="kernel"/> along the
        /// first dimension, then convolves that result along the second dimension.
        /// </summary>
        static FloatParallelArray convolveXY(float[] kernel, FloatParallelArray input)
        {
            int lastTap = kernel.Length - 1;
            FloatParallelArray firstPass = convolve(i => new[] { -i, 0 }, kernel, lastTap, input);
            return convolve(i => new[] { 0, -i }, kernel, lastTap, firstPass);
        }

        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            const int inputSize = 10;

            // Fixed seed, filled in row-major order.
            var random = new Random(42);
            var inputData = new float[inputSize, inputSize];
            for (int row = 0; row < inputSize; row++)
            {
                for (int col = 0; col < inputSize; col++)
                {
                    float fraction = (float)random.NextDouble();
                    inputData[row, col] = fraction * random.Next(1, 100);
                }
            }

            float[] testKernel = { 2, 5, 7, 4, 3 };
            var dx9Target = new DX9Target();
            var inputArray = new FloatParallelArray(inputData);
            var result = dx9Target.ToArray2D(convolveXY(testKernel, inputArray));

            // One output line per row, each value followed by a space.
            for (int row = 0; row < inputSize; row++)
            {
                for (int col = 0; col < inputSize; col++)
                {
                    Console.Write("{0} ", result[row, col]);
                }
                Console.WriteLine();
            }
        }
    }
}
/// <summary>
/// Recursively builds kernel[i] * Shift(a, shifts(i)) + ... + kernel[0] * Shift(a, shifts(0)).
/// </summary>
/// <param name="shifts">Maps a tap index to the shift vector for that tap.</param>
/// <param name="kernel">Filter weights.</param>
/// <param name="i">Index of the highest tap still to be added.</param>
/// <param name="a">Array being convolved.</param>
static FloatParallelArray convolve(Func<int, int[]> shifts, float[] kernel, int i, FloatParallelArray a)
{
    FloatParallelArray current = kernel[i] * ParallelArrays.Shift(a, shifts(i));
    if (i != 0)
    {
        // Taps below i are added by the recursive call.
        return current + convolve(shifts, kernel, i - 1, a);
    }
    return current;
}
/// <summary>
/// Convolves <paramref name="input"/> with <paramref name="kernel"/> along the
/// first dimension, then convolves that result along the second dimension.
/// </summary>
static FloatParallelArray convolveXY(float[] kernel, FloatParallelArray input)
{
    Func<int, int[]> alongFirst = i => new[] { -i, 0 };
    Func<int, int[]> alongSecond = i => new[] { 0, -i };

    FloatParallelArray firstPass = convolve(alongFirst, kernel, kernel.Length - 1, input);
    return convolve(alongSecond, kernel, kernel.Length - 1, firstPass);
}
using System;
using System.Linq;
using Microsoft.ParallelArrays;
namespace AcceleratorSamples
{
    /// <summary>
    /// LINQ and extension-method version of the two-pass convolution of a
    /// 10-by-10 pseudo-random array, evaluated through a <c>DX9Target</c>.
    /// </summary>
    static class Convolver2D
    {
        /// <summary>
        /// Builds the sum over i of kernel[i] * Shift(a, shifts(i)), adding the
        /// terms from left to right.
        /// </summary>
        /// <param name="a">Array being convolved.</param>
        /// <param name="shifts">Maps a tap index to the shift vector for that tap.</param>
        /// <param name="kernel">Filter weights.</param>
        static FloatParallelArray convolve(this FloatParallelArray a,
                                           Func<int, int[]> shifts, float[] kernel)
        {
            var terms = kernel.Select((weight, tap) => weight * ParallelArrays.Shift(a, shifts(tap)));
            return terms.Aggregate((sum, term) => sum + term);
        }

        /// <summary>
        /// Convolves along the first dimension, then convolves that result
        /// along the second dimension.
        /// </summary>
        static FloatParallelArray convolveXY(this FloatParallelArray input, float[] kernel)
        {
            FloatParallelArray firstPass = input.convolve(i => new[] { -i, 0 }, kernel);
            return firstPass.convolve(i => new[] { 0, -i }, kernel);
        }

        /// <summary>Program entry point; <paramref name="args"/> is not used.</summary>
        static void Main(string[] args)
        {
            const int inputSize = 10;

            // Fixed seed, filled in row-major order.
            var random = new Random(42);
            var inputData = new float[inputSize, inputSize];
            for (int row = 0; row < inputSize; row++)
            {
                for (int col = 0; col < inputSize; col++)
                {
                    float fraction = (float)random.NextDouble();
                    inputData[row, col] = fraction * random.Next(1, 100);
                }
            }

            float[] testKernel = { 2F, 5F, 7F, 4F, 3F };
            var dx9Target = new DX9Target();
            var inputArray = new FloatParallelArray(inputData);
            var result = dx9Target.ToArray2D(inputArray.convolveXY(testKernel));

            // One output line per row, each value followed by a space.
            for (int row = 0; row < inputSize; row++)
            {
                for (int col = 0; col < inputSize; col++)
                {
                    Console.Write("{0} ", result[row, col]);
                }
                Console.WriteLine();
            }
        }
    }
}
/// <summary>
/// Builds the sum over i of kernel[i] * Shift(a, shifts(i)), adding the
/// terms from left to right.
/// </summary>
/// <param name="a">Array being convolved.</param>
/// <param name="shifts">Maps a tap index to the shift vector for that tap.</param>
/// <param name="kernel">Filter weights.</param>
static FloatParallelArray convolve(this FloatParallelArray a,
                                   Func<int, int[]> shifts, float[] kernel)
{
    var weightedShifts = kernel.Select((weight, tap) => weight * ParallelArrays.Shift(a, shifts(tap)));
    return weightedShifts.Aggregate((total, next) => total + next);
}
/// <summary>
/// Convolves along the first dimension, then convolves that result along
/// the second dimension.
/// </summary>
static FloatParallelArray convolveXY(this FloatParallelArray input, float[] kernel)
{
    FloatParallelArray afterFirstPass = input.convolve(i => new[] { -i, 0 }, kernel);
    FloatParallelArray afterSecondPass = afterFirstPass.convolve(i => new[] { 0, -i }, kernel);
    return afterSecondPass;
}
// Separable 2D convolution: applies `filter` along the first dimension, then
// along the second, and writes the final result into `resultArray`.
//
//   tgt          target used to evaluate the expression
//   height/width dimensions of `input` and of `resultArray`
//   filterSize   number of entries in `filter`
//   filter       filter weights; tap i uses shift (i - filterSize / 2)
//   input        array to convolve
//   resultArray  receives height * width floats
//
// Returns the intermediate array produced by the first pass.
// NOTE(review): the final result is only delivered through resultArray, while
// the return value is the first-pass array - confirm this is intended.
FPA ConvolveXY(Target &tgt, int height, int width, int filterSize, float filter[],
               FPA input, float *resultArray)
{
    // Explicit casts: int -> size_t inside a braced initialiser is a narrowing conversion.
    size_t dims[] = { static_cast<size_t>(height), static_cast<size_t>(width) };
    intptr_t counts[] = {0, 0};

    const int filterHalf = filterSize / 2;
    // Last shift offset, chosen so that exactly filterSize taps are read. For an
    // odd filterSize this equals filterHalf, as before. For an even filterSize
    // the original ran one tap too far and read filter[filterSize].
    const int lastOffset = filterSize - filterHalf - 1;

    // Convolve in X (row) direction.
    FPA smoothX = FPA(0, dims, 2);
    for (int i = -filterHalf; i <= lastOffset; i++)
    {
        counts[0] = i;
        smoothX += Shift(input, counts, 2) * filter[i + filterHalf];
    }

    // Convolve in Y (col) direction.
    counts[0] = 0;
    FPA result = FPA(0, dims, 2);
    for (int i = -filterHalf; i <= lastOffset; i++)
    {
        counts[1] = i;
        result += Shift(smoothX, counts, 2) * filter[i + filterHalf];
    }

    tgt.ToArray(result, resultArray, height, width, width * sizeof(float));
    return smoothX;
}
open System
open Microsoft.ParallelArrays
/// Convolves a 10-by-10 pseudo-random array with a five-tap kernel along both
/// dimensions on a DX9Target and prints the result. Returns 0.
[<EntryPoint>]
let main(args) =
    // Filter kernel for the convolution
    let testKernel = [| 2.0f; 5.0f; 7.0f; 4.0f; 3.0f |]
    // Size of each dimension of the input array
    let inputSize = 10
    // Pseudo-random number generator with a fixed seed
    let random = Random (42)
    // Pseudo-random input data; every element draws NextDouble first, then Next
    let nextSample () = float32 (random.NextDouble() * float (random.Next(1, 100)))
    let testData = Array2D.init inputSize inputSize (fun _ _ -> nextSample ())
    // Accelerator float parallel array wrapping the F# input array
    use testArray = new FloatParallelArray(testData)
    // Convolution along one direction: sums kernel.[j] * Shift(a, shifts j) for j = i down to 0
    let rec convolve (shifts : int -> int []) (kernel : float32 []) i (a : FloatParallelArray) =
        let term = kernel.[i] * ParallelArrays.Shift(a, shifts i)
        match i with
        | 0 -> term
        | _ -> term + convolve shifts kernel (i-1) a
    // 2D convolver: first along the first dimension, then along the second
    let convolveXY (kernel : float32 []) input =
        let lastTap = kernel.Length - 1
        input
        |> convolve (fun i -> [| -i; 0 |]) kernel lastTap
        |> convolve (fun i -> [| 0; -i |]) kernel lastTap
    // Create a DX9 target and use it to convolve the test input
    use dx9Target = new DX9Target()
    let convolveDX9 = dx9Target.ToArray2D (convolveXY testKernel testArray)
    printfn "DX9: -> \r\n%A" convolveDX9
    0
/// Convolves `input` with `kernel` along the first dimension, then convolves
/// that result along the second dimension.
let convolveXY (kernel : float32 []) input =
    let lastTap = kernel.Length - 1
    input
    |> convolve (fun i -> [| -i; 0 |]) kernel lastTap
    |> convolve (fun i -> [| 0; -i |]) kernel lastTap
0
5
10
15
20
25
30
35
40
0 20 40 60 80 100 120
speedup over sequential version
kernel size
Speedup using Accelerator GPU and SSE3 multicore targets for an 8000x8000 convolver
ATI Radeon HD5870 (DX9)
NVidia 470 GTX (DX9)
SSE3 Intel Xeon E7450 12 cores
Convolver
8.249ns max delay 3 x DSP48Es 63 slice registers 24 slice LUTs
Accelerator
Register for the Next GTC Express
GPU-enabled Macromolecular Simulation: Challenges and
Opportunities
Michela Taufer, Assistant Professor, Department of Computer and
Information Sciences, University of Delaware
Thursday, December 1, 2011, 9:00 AM PDT
Register at www.gputechconf.com