Upload
others
View
26
Download
1
Embed Size (px)
Citation preview
Optimizer’s Toolbox Fast CUDA Techniques For Real-time Image Processing
Presenter: Sarah Kabala ([email protected])
“I think that’s impossible.”
Warp = 32 Threads
Registers (<= 255 * 32) W
R
Shared Memory W
R
L1 / Texture Cache R
L2 Cache
Main Memory W
R
W
R
Block = ? Warps
. . .
Shared Memory W
R
L1 / Texture Cache R
L2 Cache
Main Memory W
R
W
R
Memory
GP100
. . .
SM
Reg
SMem L1/Tex
SM
x255
64KB 24KB
SM
x8160
banks lines
L2 (4096KB)
Main Memory (16GB)
Latency
😄 R
😥 L1
😱 L2
RAM
__global__ // CUDA
// Computes Otsu's threshold over a 256-bin histogram with a single block.
//
// Launch contract (enforced by the asserts below):
//   blockDim = (32, 8)  -- one warp per y-row, 256 threads = one per candidate threshold
//   gridDim  = (1, 1)
//
// Outputs: *thresholdPtr = largest threshold achieving the maximum inter-class variance.
// Reference: https://en.wikipedia.org/wiki/Otsu%27s_Method
//
// NOTE(review): uses legacy mask-less __shfl / __shfl_xor / __shfl_down intrinsics,
// which assume implicit warp synchrony -- valid on pre-Volta parts (e.g. GP100);
// port to the *_sync variants for compute capability 7.0+.
void kernel_calcOtsuThreshold(int *thresholdPtr, const int *histogram, const int numPixels)
{
// Reference: https://en.wikipedia.org/wiki/Otsu%27s_Method
CUDA_ASSERT( blockDim.x == 32 ); // warp
CUDA_ASSERT( blockDim.x * blockDim.y == 256 ); // thresholds to try
CUDA_ASSERT( gridDim.x == 1 );
CUDA_ASSERT( gridDim.y == 1 );
// shared memory for block
__shared__ float cdf[256 + 1]; // table of cumulative probability: cdf[t] = P(bin < t)
__shared__ float com[256 + 1]; // table of cumulative center of mass: com[t] = sum of bin*p for bin < t
__shared__ int maxVariance; // maximum inter-class variance (float bits stored as int for atomicMax)
const int threshold = threadIdx.y * blockDim.x + threadIdx.x; // 0..255, one per thread
// 1. Accumulate table values
// set initial values
if (threshold == 0)
{
cdf[0] = 0.0f;
com[0] = 0.0f;
maxVariance = 0;
*thresholdPtr = 0;
}
// use only the first warp to avoid atomics and syncthread
if (threadIdx.y == 0)
{
const float normalize = 1.0f / numPixels;
float laneSum = 0.0f; // this thread lane's probability sum
float laneMean = 0.0f; // this thread lane's (center * probability) sum
for (int bin = threshold; bin < 256; bin += 32)
{
//float warpSum = 1.0f; // DEBUG: Internal test of CDF correctness
float warpSum = histogram[bin] * normalize; // warp sum of bin probabilities
CUDA_ASSERT(warpSum >= 0.0f);
float warpMean = bin * warpSum; // warp sum of bin (center * probability)
laneSum += warpSum;
laneMean += warpMean;
// prefix sum for this window of histogram (butterfly / XOR pattern)
for (int laneMask = 0x01; laneMask < 32; laneMask <<= 1) // increasing powers of 2
{
float passSum = __shfl_xor(warpSum, laneMask);
float passMean = __shfl_xor(warpMean, laneMask);
// accumulate only a forward pass to this thread lane -- ! 50% divergence
if (threadIdx.x & laneMask)
{
laneSum += passSum;
laneMean += passMean;
}
// accumulate full warp reduction
warpSum += passSum;
warpMean += passMean;
}
CUDA_ASSERT(warpSum == __shfl(warpSum, 32 - 1));
CUDA_ASSERT(warpMean == __shfl(warpMean, 32 - 1));
// set lane values for this window of histogram
const int i = bin + 1;
CUDA_ASSERT(laneSum >= 0.0f);
cdf[i] = laneSum;
CUDA_ASSERT(laneMean >= 0.0f);
com[i] = laneMean;
//CUDA_ASSERT(laneSum == (float)i); // DEBUG: Internal test of CDF correctness
// update all lane values to the last lane's values before moving on to next hist window
CUDA_ASSERT(laneSum <= __shfl_down(laneSum, 1) + /*tolerance*/2e-3f); // increasing
CUDA_ASSERT(laneMean <= __shfl_down(laneMean, 1) + /*tolerance*/2e-3f); // increasing
laneSum = __shfl(laneSum, 32 - 1);
laneMean = __shfl(laneMean, 32 - 1);
}
// final laneSum should be 1.0 but might drift a little due to FP error
CUDA_ASSERT(abs(laneSum - 1.0f) < /*tolerance*/1e-3f);
}
__syncthreads(); // --------------------------------------------------- tables published
// 2. Determine maximum inter-class variance
float variance_f = 0.0f; // inter-class variance
const float w0 = cdf[threshold]; // class-0 probability mass
const float w1 = max(0.0f, 1.0f - w0); // class-1 probability mass
const float u0 = com[threshold] / w0; // class-0 mean -- ! possible divide-by-0 (guarded below)
// FIX: class-1 mean is (total COM - class-0 COM) / w1; the original subtracted the
// class-0 *mean* u0 instead of the cumulative mass com[threshold].
const float u1 = (com[256] - com[threshold]) / w1; // ! possible divide-by-0 (guarded below)
if (w0 > 0.0f && w1 > 0.0f) // ! possible divergence
{
variance_f = max(0.0f, w0 * w1 * (u0 - u1) * (u0 - u1));
}
CUDA_ASSERT(variance_f >= 0.0f);
const int variance_i = __float_as_int(variance_f); // this thread's own variance, pre-reduction
// shuffle reduce the warp maximum variance to call 8 atomics instead of 256 below
variance_f = max(__shfl_xor(variance_f, 0x01), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x02), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x04), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x08), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x10), variance_f);
if (threadIdx.x == 0)
{
// float-as-int comparisons are correct because all variances are positive
atomicMax(&maxVariance, __float_as_int(variance_f));
}
__syncthreads(); // --------------------------------------------------- maxVariance final
// 3. Determine maximum threshold value that has the maxVariance
if (variance_i == maxVariance)
{
atomicMax(thresholdPtr, threshold);
}
}
Flat Tires
Atomics
Shared memory bank conflicts
Uncoalesced L1 or L2 read/write
Reading the same cacheline more than once from L2
Writing the same cacheline more than once to L2
All main memory read/write
Thread divergence
__syncthreads()
Racing Stripes
Hardware filter with texfetch
Consume input cacheline with exactly 1 warp
Produce output cacheline with exactly 1 warp
Buffer re-used values in registers instead of re-reading
Accumulate/reduce/prefix-sum with warp shuffles
Use thread lane 0 for atomics
Call NPP for basic, unfused ops
Call cuFFT for convolution, correlation, high/low pass
Input Decimation
Free Bilerp
Warp Speed
__all
__any
__ballot
__shfl
__shfl_up
__shfl_down
__shfl_xor
Statistics
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = terms[lane], pass; sum += __shfl_xor(sum, 0x10); sum += __shfl_xor(sum, 0x08); sum += __shfl_xor(sum, 0x04); sum += __shfl_xor(sum, 0x02); sum += __shfl_xor(sum, 0x01); CUDA_ASSERT(sum == __shfl(sum, 0));
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
Prefix Sum
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = terms[lane], pass; int presum = terms[lane]; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 1 2 3 4 5 6 7 8
Prefix Sum 1 2 3 4 5 6 7 8
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 3 3 7 7 11 11 15 15
Prefix Sum 1 2 3 4 5 6 7 8
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 3 3 7 7 11 11 15 15
Prefix Sum 1 3 3 7 5 11 7 15
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 10 10 10 10 26 26 26 26
Prefix Sum 1 3 3 7 5 11 7 15
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 10 10 10 10 26 26 26 26
Prefix Sum 1 3 6 10 5 11 18 26
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 5 11 18 26
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 15 21 28 36
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 15 21 28 36
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
CDF from Histogram
int fullsum = 0, temp[256 / 32]; for (int i = lane; i < 256; i += 32) { int presum = histogram[i]; fullsum += prefixSum(presum); temp[i] = presum; } CUDA_ASSERT(fullsum > 0); for (int i = lane; i < 256; i += 32) { const int window = i >> 5; cdf[i] = temp[window] / (float)fullsum; }
The Frequency Domain
Spatial Domain Frequency Domain
Convolution Element-wise Multiplication
Deconvolution Element-wise Division
Correlation Element-wise Multiplication
Noise Reduction Low-pass Filter
Edge Detection High-pass Filter
Discrete Fourier Transform
Natural RGB Wave Magnitudes
DFT
N number of samples
xn nth pixel value
Xk kth frequency value
cuFFT for Images
FFT Inverse FFT
// Forward 2-D real-to-complex FFT of a w x h luminance image.
// Returns the number of complex values written to freqImg.
//
// cufftPlan2d(nx, ny) takes the slowest-changing dimension first, so for a
// row-major image we pass (h, w). R2C halves the LAST (fastest) dimension,
// so the packed Hermitian output holds h * (w/2 + 1) complex elements.
// FIX: the original returned w * (h/2 + 1), which is wrong for non-square images.
int fft(cufftComplex *freqImg,
cufftReal *lumImg,
const int w, const int h)
{
cufftHandle fwd;
cufftPlan2d(&fwd, h, w, CUFFT_R2C); // rows = h (slow dim), cols = w (fast dim)
cufftExecR2C(fwd, lumImg, freqImg);
cufftDestroy(fwd);
return (h * (w / 2 + 1)); // = n, packed half-spectrum element count
}
// Inverse 2-D complex-to-real FFT: packed half-spectrum (freqImg) back to a
// w x h luminance image (lumImg).
// NOTE: cuFFT C2R output is unnormalized -- divide lumImg by (w * h) afterwards
// (see comment following this function in the original source).
// WARNING: C2R may overwrite its input buffer (freqImg) as scratch.
void ifft(cufftComplex *freqImg,
cufftReal *lumImg,
const int w, const int h)
{
cufftHandle inv;
cufftPlan2d(&inv, h, w, CUFFT_C2R); // rows = h (slow dim), cols = w (fast dim)
cufftExecC2R(inv, freqImg, lumImg); // FIX: was 'fftImg', an undeclared identifier
cufftDestroy(inv);
}
//Divide lumImg by (w * h) to normalize
Phase Correlation
Test Image Padded Template Match Scores
Cross-Power Spectrum
R = (F1 ∘ F2*) / |F1 ∘ F2*|
R Scores in freq. domain
F1 DFT of test image
F2 DFT of template image
* Complex conjugate
◦ Element-wise product
/ Element-wise division
| | Element-wise magnitude
// Computes the normalized cross-power spectrum R = (F1 . conj(F2)) / |F1 . conj(F2)|
// element-wise over n complex samples (phase-correlation numerator).
// One thread per element; launch with any 1-D grid covering n.
__global__ void crossPowerSpectrum(
cufftComplex *R, const cufftComplex *F1,
const cufftComplex *F2, const int n)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x;
// FIX: tail guard -- the original never used n, so threads past the end
// read/wrote out of bounds whenever n was not a multiple of blockDim.x.
if (i >= n) return;
const cufftComplex a = F1[i];
const cufftComplex b_conj = cuConjf(F2[i]);
cufftComplex c = cuCmulf(a, b_conj);
// NOTE(review): a zero-magnitude product makes norm infinite (rsqrtf(0));
// real-image spectra rarely hit exact zero, but callers should be aware.
const float norm
= rsqrtf(c.x * c.x + c.y * c.y);
c.x *= norm;
c.y *= norm;
R[i] = c;
}
Sliding Window
Low Compute-to-Read/Write Ratio
Current Frame Difference Cutout Color Dodge
© =
Nowhere to Hide From Latency
Read/Write Time
Co
mp
ute
Tim
e
Co
mp
ute
Tim
e
Read/Write Time
Co
mp
ute
Tim
e
Read/Write Time
Ideal SM Load Compute Heavy Read/Write Heavy
Eliminate Redundant Uses of Bandwidth
One-to-One
• Implement in a fragment shader
• Or, fuse all processing steps into 1 kernel
• Keep intermediate values in registers
• Use 128-byte coalesced reads
Many-to-One
• Identify input footprint overlap
• Gather footprints in sliding warp windows
• Use shuffles & votes for warp sharing
• Use shared memory for block sharing
Sliding Through Footprints
Tradeoff between:
• Warp compute depth
• Re-reading of footprint overlap
Make this tradeoff a template variable so you can test it.
Buffer re-used data in registers (1st) and shared memory (2nd).
Mean Shift // Pseudocode for (int x = slideStart + lane; x < slideWidth; x += 32) { // For the center footprint row: centerColor = inputImage[centerIndex]; colorSum = gaussianWeight(centerRow, centerCol) * centerColor; weightSum = gaussianWeight(centerRow, centerCol); // Shuffle to accumulate all footprint columns in center row... // For each other footprint row: { color = inputImage[colorIndex]; // If color is close enough to centerColor: { colorSum += gaussianWeight(row, centerCol) * color; weightSum += gaussianWeight(row, centerCol); } // Shuffle to accumulate footprint columns in this row... } meanShift[centerIndex] = colorSum / weightSum; // ELIDED: Remember previous window’s center column colors and use // to finish meanShift for lanes whose footprints overflow window. }
XY Separable
A Single Footprint of Separable Gaussian Blur
X Kernel Y Kernel
Is It Separable?
• Many Convolutions
• Local Statistics
• Local Histogram
Input Temp.
Data Output X Kernel Y Kernel
X Kernel: One Output from Many Inputs
• Buffer 128B input in 1 warp-read
• Slide warp by 1 output at a time
• Shuffle-accumulate to build outputs
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
Warp Sliding Window
128B or Footprint Width
X Kernel: One Input to Many Outputs
• Buffer 128B input in 1 warp-read
• Slide warp by 1 input at a time
• Shuffle-broadcast to build outputs
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
Warp Sliding Window
128B or Footprint Width
Intermediate Data
Convolution or Statistics
• Interleave pixel components
Histogram
• Separate component (bin) planes
Y Kernel
• Buffer 128B input in 1 warp-read
• Slide warp by 1 input row at a time
• Accumulate output in thread registers
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
War
p S
lidin
g W
ind
ow
128B
Footprint
Height
Play Video
“I can do it.”
Questions?
Further Reading FFT: http://www.fmwconcepts.com/imagemagick/fourier_transforms/fourier.html
CUDA (Appendices): http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
Image Credits Mad Max: Fury Road (2015) (Fair Use)
Copyright: (https://www.123rf.com/profile_macropixel) macropixel / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_4zevar) 4zevar / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_donets) donets / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_mejn) mejn / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_yurchak) yurchak / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kagenmi) kagenmi / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kzlobastov) kzlobastov / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_lenm) lenm / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_cobalt) cobalt / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kmiragaya) kmiragaya / 123RF Stock Photo (Royalty Free)
Hand Tools of the Trade
• Avoid divergence with MADDs: x += (predicate) * y;
• Have warp lane 0 perform shared memory atomics
• Have block thread (0,0,0) perform global atomics
• Useful with __ballot(): “population count” __popc(), “ find first set bit” __ffs()
• Write benchmarks and retest with new cards – balance points can change
• Read appendices to the CUDA Programming Guide – diagrams & secrets
• Write image files for input, output, and intermediate data to find bugs
• Test #pragma unroll before using it
• Use __forceinline__ and const T* __restrict__
• Use intrinsics and mathematical functions wherever possible
• Use the smallest data type possible