Upload
others
View
26
Download
1
Embed Size (px)
Citation preview
Optimizer’s Toolbox Fast CUDA Techniques For Real-time Image Processing
Presenter: Sarah Kabala ([email protected])
“I think that’s impossible.”
Warp = 32 Threads
Registers (<= 255 * 32) W
R
Shared Memory W
R
L1 / Texture Cache R
L2 Cache
Main Memory W
R
W
R
Block = ? Warps
. . .
Shared Memory W
R
L1 / Texture Cache R
L2 Cache
Main Memory W
R
W
R
Memory
GP100
. . .
SM
Reg
SMem L1/Tex
SM
x255
64KB 24KB
SM
x8160
banks lines
L2 (4096KB)
Main Memory (16GB)
Latency
😄 R
😥 L1
😱 L2
RAM
__global__ // CUDA
// Computes Otsu's threshold over a 256-bin histogram with a single block.
//
// Launch contract (enforced by the asserts below):
//   blockDim = (32, 8)  -- one warp per y-row, 256 threads = one per candidate threshold
//   gridDim  = (1, 1)
//
// Outputs: *thresholdPtr = largest threshold achieving the maximum inter-class variance.
// Reference: https://en.wikipedia.org/wiki/Otsu%27s_Method
//
// NOTE(review): uses legacy mask-less __shfl / __shfl_xor / __shfl_down intrinsics,
// which assume implicit warp synchrony -- valid on pre-Volta parts (e.g. GP100);
// port to the *_sync variants for compute capability 7.0+.
void kernel_calcOtsuThreshold(int *thresholdPtr, const int *histogram, const int numPixels)
{
// Reference: https://en.wikipedia.org/wiki/Otsu%27s_Method
CUDA_ASSERT( blockDim.x == 32 ); // warp
CUDA_ASSERT( blockDim.x * blockDim.y == 256 ); // thresholds to try
CUDA_ASSERT( gridDim.x == 1 );
CUDA_ASSERT( gridDim.y == 1 );
// shared memory for block
__shared__ float cdf[256 + 1]; // table of cumulative probability: cdf[t] = P(bin < t)
__shared__ float com[256 + 1]; // table of cumulative center of mass: com[t] = sum of bin*p for bin < t
__shared__ int maxVariance; // maximum inter-class variance (float bits stored as int for atomicMax)
const int threshold = threadIdx.y * blockDim.x + threadIdx.x; // 0..255, one per thread
// 1. Accumulate table values
// set initial values
if (threshold == 0)
{
cdf[0] = 0.0f;
com[0] = 0.0f;
maxVariance = 0;
*thresholdPtr = 0;
}
// use only the first warp to avoid atomics and syncthread
if (threadIdx.y == 0)
{
const float normalize = 1.0f / numPixels;
float laneSum = 0.0f; // this thread lane's probability sum
float laneMean = 0.0f; // this thread lane's (center * probability) sum
for (int bin = threshold; bin < 256; bin += 32)
{
//float warpSum = 1.0f; // DEBUG: Internal test of CDF correctness
float warpSum = histogram[bin] * normalize; // warp sum of bin probabilities
CUDA_ASSERT(warpSum >= 0.0f);
float warpMean = bin * warpSum; // warp sum of bin (center * probability)
laneSum += warpSum;
laneMean += warpMean;
// prefix sum for this window of histogram (butterfly / XOR pattern)
for (int laneMask = 0x01; laneMask < 32; laneMask <<= 1) // increasing powers of 2
{
float passSum = __shfl_xor(warpSum, laneMask);
float passMean = __shfl_xor(warpMean, laneMask);
// accumulate only a forward pass to this thread lane -- ! 50% divergence
if (threadIdx.x & laneMask)
{
laneSum += passSum;
laneMean += passMean;
}
// accumulate full warp reduction
warpSum += passSum;
warpMean += passMean;
}
CUDA_ASSERT(warpSum == __shfl(warpSum, 32 - 1));
CUDA_ASSERT(warpMean == __shfl(warpMean, 32 - 1));
// set lane values for this window of histogram
const int i = bin + 1;
CUDA_ASSERT(laneSum >= 0.0f);
cdf[i] = laneSum;
CUDA_ASSERT(laneMean >= 0.0f);
com[i] = laneMean;
//CUDA_ASSERT(laneSum == (float)i); // DEBUG: Internal test of CDF correctness
// update all lane values to the last lane's values before moving on to next hist window
CUDA_ASSERT(laneSum <= __shfl_down(laneSum, 1) + /*tolerance*/2e-3f); // increasing
CUDA_ASSERT(laneMean <= __shfl_down(laneMean, 1) + /*tolerance*/2e-3f); // increasing
laneSum = __shfl(laneSum, 32 - 1);
laneMean = __shfl(laneMean, 32 - 1);
}
// final laneSum should be 1.0 but might drift a little due to FP error
CUDA_ASSERT(abs(laneSum - 1.0f) < /*tolerance*/1e-3f);
}
__syncthreads(); // --------------------------------------------------- tables published
// 2. Determine maximum inter-class variance
float variance_f = 0.0f; // inter-class variance
const float w0 = cdf[threshold]; // class-0 probability mass
const float w1 = max(0.0f, 1.0f - w0); // class-1 probability mass
const float u0 = com[threshold] / w0; // class-0 mean -- ! possible divide-by-0 (guarded below)
// FIX: class-1 mean is (total COM - class-0 COM) / w1; the original subtracted the
// class-0 *mean* u0 instead of the cumulative mass com[threshold].
const float u1 = (com[256] - com[threshold]) / w1; // ! possible divide-by-0 (guarded below)
if (w0 > 0.0f && w1 > 0.0f) // ! possible divergence
{
variance_f = max(0.0f, w0 * w1 * (u0 - u1) * (u0 - u1));
}
CUDA_ASSERT(variance_f >= 0.0f);
const int variance_i = __float_as_int(variance_f); // this thread's own variance, pre-reduction
// shuffle reduce the warp maximum variance to call 8 atomics instead of 256 below
variance_f = max(__shfl_xor(variance_f, 0x01), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x02), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x04), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x08), variance_f);
variance_f = max(__shfl_xor(variance_f, 0x10), variance_f);
if (threadIdx.x == 0)
{
// float-as-int comparisons are correct because all variances are positive
atomicMax(&maxVariance, __float_as_int(variance_f));
}
__syncthreads(); // --------------------------------------------------- maxVariance final
// 3. Determine maximum threshold value that has the maxVariance
if (variance_i == maxVariance)
{
atomicMax(thresholdPtr, threshold);
}
}
Flat Tires
Atomics
Shared memory bank conflicts
Uncoalesced L1 or L2 read/write
Reading the same cacheline more than once from L2
Writing the same cacheline more than once to L2
All main memory read/write
Thread divergence
__syncthreads()
Racing Stripes
Hardware filter with texfetch
Consume input cacheline with exactly 1 warp
Produce output cacheline with exactly 1 warp
Buffer re-used values in registers instead of re-reading
Accumulate/reduce/prefix-sum with warp shuffles
Use thread lane 0 for atomics
Call NPP for basic, unfused ops
Call cuFFT for convolution, correlation, high/low pass
Input Decimation
Free Bilerp
Warp Speed
__all
__any
__ballot
__shfl
__shfl_up
__shfl_down
__shfl_xor
Statistics
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = terms[lane], pass; sum += __shfl_xor(sum, 0x10); sum += __shfl_xor(sum, 0x08); sum += __shfl_xor(sum, 0x04); sum += __shfl_xor(sum, 0x02); sum += __shfl_xor(sum, 0x01); CUDA_ASSERT(sum == __shfl(sum, 0));
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
Prefix Sum
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = terms[lane], pass; int presum = terms[lane]; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
0 1 2 3 4 5 6 7
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 1 2 3 4 5 6 7 8
Prefix Sum 1 2 3 4 5 6 7 8
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 3 3 7 7 11 11 15 15
Prefix Sum 1 2 3 4 5 6 7 8
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 3 3 7 7 11 11 15 15
Prefix Sum 1 3 3 7 5 11 7 15
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 10 10 10 10 26 26 26 26
Prefix Sum 1 3 3 7 5 11 7 15
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 10 10 10 10 26 26 26 26
Prefix Sum 1 3 6 10 5 11 18 26
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
Values Seen
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 5 11 18 26
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 15 21 28 36
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
Step Through: Sum & Prefix Sum
Lane 0 1 2 3 4 5 6 7
1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2
3 3 3 3 3 3 3 3
4 4 4 4 4 4 4 4
5 5 5 5 5 5 5 5
6 6 6 6 6 6 6 6
7 7 7 7 7 7 7 7
8 8 8 8 8 8 8 8
Sum 36 36 36 36 36 36 36 36
Prefix Sum 1 3 6 10 15 21 28 36
CUDA_ASSERT(__ballot(true) == 0xFFFFFFFF);
int sum = lane + 1 , pass; int presum = lane + 1; sum += pass = __shfl_xor(sum, 0x01); if (lane & 0x01) presum += pass; sum += pass = __shfl_xor(sum, 0x02); if (lane & 0x02) presum += pass; sum += pass = __shfl_xor(sum, 0x04); if (lane & 0x04) presum += pass; sum += pass = __shfl_xor(sum, 0x08); if (lane & 0x08) presum += pass; sum += pass = __shfl_xor(sum, 0x10); if (lane & 0x10) presum += pass; CUDA_ASSERT(sum == __shfl(sum, 0));
CDF from Histogram
int fullsum = 0, temp[256 / 32]; for (int i = lane; i < 256; i += 32) { int presum = histogram[i]; fullsum += prefixSum(presum); temp[i] = presum; } CUDA_ASSERT(fullsum > 0); for (int i = lane; i < 256; i += 32) { const int window = i >> 5; cdf[i] = temp[window] / (float)fullsum; }
The Frequency Domain
Spatial Domain Frequency Domain
Convolution Element-wise Multiplication
Deconvolution Element-wise Division
Correlation Element-wise Multiplication
Noise Reduction Low-pass Filter
Edge Detection High-pass Filter
Discrete Fourier Transform
Natural RGB Wave Magnitudes
DFT
N number of samples
xn nth pixel value
Xk kth frequency value
cuFFT for Images
FFT Inverse FFT
// Forward 2-D real-to-complex FFT of a w x h luminance image.
// Returns the number of complex values written to freqImg.
//
// cufftPlan2d(nx, ny) takes the slowest-changing dimension first, so for a
// row-major image we pass (h, w). R2C halves the LAST (fastest) dimension,
// so the packed Hermitian output holds h * (w/2 + 1) complex elements.
// FIX: the original returned w * (h/2 + 1), which is wrong for non-square images.
int fft(cufftComplex *freqImg,
cufftReal *lumImg,
const int w, const int h)
{
cufftHandle fwd;
cufftPlan2d(&fwd, h, w, CUFFT_R2C); // rows = h (slow dim), cols = w (fast dim)
cufftExecR2C(fwd, lumImg, freqImg);
cufftDestroy(fwd);
return (h * (w / 2 + 1)); // = n, packed half-spectrum element count
}
// Inverse 2-D complex-to-real FFT: packed half-spectrum (freqImg) back to a
// w x h luminance image (lumImg).
// NOTE: cuFFT C2R output is unnormalized -- divide lumImg by (w * h) afterwards
// (see comment following this function in the original source).
// WARNING: C2R may overwrite its input buffer (freqImg) as scratch.
void ifft(cufftComplex *freqImg,
cufftReal *lumImg,
const int w, const int h)
{
cufftHandle inv;
cufftPlan2d(&inv, h, w, CUFFT_C2R); // rows = h (slow dim), cols = w (fast dim)
cufftExecC2R(inv, freqImg, lumImg); // FIX: was 'fftImg', an undeclared identifier
cufftDestroy(inv);
}
//Divide lumImg by (w * h) to normalize
Phase Correlation
Test Image Padded Template Match Scores
Cross-Power Spectrum
R = (F1 ∘ F2*) / |F1 ∘ F2*|
R Scores in freq. domain
F1 DFT of test image
F2 DFT of template image
* Complex conjugate
◦ Element-wise product
/ Element-wise division
| | Element-wise magnitude
// Computes the normalized cross-power spectrum R = (F1 . conj(F2)) / |F1 . conj(F2)|
// element-wise over n complex samples (phase-correlation numerator).
// One thread per element; launch with any 1-D grid covering n.
__global__ void crossPowerSpectrum(
cufftComplex *R, const cufftComplex *F1,
const cufftComplex *F2, const int n)
{
const int i = blockIdx.x * blockDim.x + threadIdx.x;
// FIX: tail guard -- the original never used n, so threads past the end
// read/wrote out of bounds whenever n was not a multiple of blockDim.x.
if (i >= n) return;
const cufftComplex a = F1[i];
const cufftComplex b_conj = cuConjf(F2[i]);
cufftComplex c = cuCmulf(a, b_conj);
// NOTE(review): a zero-magnitude product makes norm infinite (rsqrtf(0));
// real-image spectra rarely hit exact zero, but callers should be aware.
const float norm
= rsqrtf(c.x * c.x + c.y * c.y);
c.x *= norm;
c.y *= norm;
R[i] = c;
}
Sliding Window
Low Compute-to-Read/Write Ratio
Current Frame Difference Cutout Color Dodge
© =
Nowhere to Hide From Latency
Read/Write Time
Co
mp
ute
Tim
e
Co
mp
ute
Tim
e
Read/Write Time
Co
mp
ute
Tim
e
Read/Write Time
Ideal SM Load Compute Heavy Read/Write Heavy
Eliminate Redundant Uses of Bandwidth
One-to-One
• Implement in a fragment shader
• Or, fuse all processing steps into 1 kernel
• Keep intermediate values in registers
• Use 128-byte coalesced reads
Many-to-One
• Identify input footprint overlap
• Gather footprints in sliding warp windows
• Use shuffles & votes for warp sharing
• Use shared memory for block sharing
Sliding Through Footprints
Tradeoff between:
• Warp compute depth
• Re-reading of footprint overlap
Make this tradeoff a template variable so you can test it.
Buffer re-used data in registers (1st) and shared memory (2nd).
Mean Shift // Pseudocode for (int x = slideStart + lane; x < slideWidth; x += 32) { // For the center footprint row: centerColor = inputImage[centerIndex]; colorSum = gaussianWeight(centerRow, centerCol) * centerColor; weightSum = gaussianWeight(centerRow, centerCol); // Shuffle to accumulate all footprint columns in center row... // For each other footprint row: { color = inputImage[colorIndex]; // If color is close enough to centerColor: { colorSum += gaussianWeight(row, centerCol) * color; weightSum += gaussianWeight(row, centerCol); } // Shuffle to accumulate footprint columns in this row... } meanShift[centerIndex] = colorSum / weightSum; // ELIDED: Remember previous window’s center column colors and use // to finish meanShift for lanes whose footprints overflow window. }
XY Separable
A Single Footprint of Separable Gaussian Blur
X Kernel Y Kernel
Is It Separable?
• Many Convolutions
• Local Statistics
• Local Histogram
Input Temp.
Data Output X Kernel Y Kernel
X Kernel: One Output from Many Inputs
• Buffer 128B input in 1 warp-read
• Slide warp by 1 output at a time
• Shuffle-accumulate to build outputs
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
Warp Sliding Window
128B or Footprint Width
X Kernel: One Input to Many Outputs
• Buffer 128B input in 1 warp-read
• Slide warp by 1 input at a time
• Shuffle-broadcast to build outputs
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
Warp Sliding Window
128B or Footprint Width
Intermediate Data
Convolution or Statistics
• Interleave pixel components
Histogram
• Separate component (bin) planes
Y Kernel
• Buffer 128B input in 1 warp-read
• Slide warp by 1 input row at a time
• Accumulate output in thread registers
• Write 128B output in 1 warp-write
• Loop to process more than 1 footprint of input
War
p S
lidin
g W
ind
ow
128B
Footprint
Height
Play Video
“I can do it.”
Questions?
Further Reading FFT: http://www.fmwconcepts.com/imagemagick/fourier_transforms/fourier.html
CUDA (Appendices): http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
Image Credits Mad Max: Fury Road (2015) (Fair Use)
Copyright: (https://www.123rf.com/profile_macropixel) macropixel / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_4zevar) 4zevar / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_donets) donets / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_mejn) mejn / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_yurchak) yurchak / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kagenmi) kagenmi / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kzlobastov) kzlobastov / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_lenm) lenm / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_cobalt) cobalt / 123RF Stock Photo (Royalty Free)
Copyright: (https://www.123rf.com/profile_kmiragaya) kmiragaya / 123RF Stock Photo (Royalty Free)
Hand Tools of the Trade
• Avoid divergence with MADDs: x += (predicate) * y;
• Have warp lane 0 perform shared memory atomics
• Have block thread (0,0,0) perform global atomics
• Useful with __ballot(): “population count” __popc(), “ find first set bit” __ffs()
• Write benchmarks and retest with new cards – balance points can change
• Read appendices to the CUDA Programming Guide – diagrams & secrets
• Write image files for input, output, and intermediate data to find bugs
• Test #pragma unroll before using it
• Use __forceinline__ and const T* __restrict__
• Use intrinsics and mathematical functions wherever possible
• Use the smallest data type possible