+ All Categories
Home > Documents > GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Date post: 01-Jan-2016
Category:
Upload: melanie-ross
View: 218 times
Download: 0 times
Share this document with a friend
Popular Tags:
33
GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA
Transcript
Page 1: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

GPU Program Optimization

Cliff Woolley

University of Virginia / NVIDIA

Page 2: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Overview

• Data Parallel Computing

• Computational Frequency

• Profiling and Load Balancing

Page 3: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Data Parallel Computing

Page 4: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

• Instruction-Level Parallelism

• Data-Level Parallelism

Data Parallel Computing

Page 5: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

frag2frame Smooth(vert2frag IN, uniform samplerRECT Source : texunit0, uniform samplerRECT Operator : texunit1,
                  uniform samplerRECT Boundary : texunit2, uniform float4 params)
{
    frag2frame OUT;

    float2 center = IN.TexCoord0.xy;
    float4 U = f4texRECT(Source, center);
    // Calculate Red-Black (odd-even) masks
    float2 intpart;
    float2 place = floor(1.0f - modf(round(center + float2(0.5f, 0.5f)) / 2.0f, intpart));
    float2 mask = float2((1.0f-place.x) * (1.0f-place.y), place.x * place.y);
    if (((mask.x + mask.y) && params.y) || (!(mask.x + mask.y) && !params.y))
    {
        float2 offset = float2(params.x*center.x - 0.5f*(params.x-1.0f), params.x*center.y - 0.5f*(params.x-1.0f));
        ...
        float4 neighbor = float4(center.x - 1.0f, center.x + 1.0f, center.y - 1.0f, center.y + 1.0f);
        float central = -2.0f*(O.x + O.y);
        float poisson = ((params.x*params.x)*U.z + (-O.x * f1texRECT(Source, float2(neighbor.x, center.y)) +
                                                    -O.x * f1texRECT(Source, float2(neighbor.y, center.y)) +
                                                    -O.y * f1texRECT(Source, float2(center.x, neighbor.z)) +
                                                    -O.z * f1texRECT(Source, float2(center.x, neighbor.w)))) / O.w;
        OUT.COL.x = poisson;
    }
    ...
    return OUT;
}

A really naïve shader

Page 6: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

frag2frame Smooth(vert2frag IN, uniform samplerRECT Source : texunit0, uniform samplerRECT Operator : texunit1,
                  uniform samplerRECT Boundary : texunit2, uniform float4 params)
{
    frag2frame OUT;

    float2 center = IN.TexCoord0.xy;
    float4 U = f4texRECT(Source, center);
    // Calculate Red-Black (odd-even) masks
    float2 intpart;
    float2 place = floor(1.0f - modf(round(center + float2(0.5f, 0.5f)) / 2.0f, intpart));
    float2 mask = float2((1.0f-place.x) * (1.0f-place.y), place.x * place.y);
    if (((mask.x + mask.y) && params.y) || (!(mask.x + mask.y) && !params.y))
    {
        float2 offset = float2(params.x*center.x - 0.5f*(params.x-1.0f), params.x*center.y - 0.5f*(params.x-1.0f));
        ...
        float4 neighbor = float4(center.x - 1.0f, center.x + 1.0f, center.y - 1.0f, center.y + 1.0f);
        float central = -2.0f*(O.x + O.y);
        float poisson = ((params.x*params.x)*U.z + (-O.x * f1texRECT(Source, float2(neighbor.x, center.y)) +
                                                    -O.x * f1texRECT(Source, float2(neighbor.y, center.y)) +
                                                    -O.y * f1texRECT(Source, float2(center.x, neighbor.z)) +
                                                    -O.z * f1texRECT(Source, float2(center.x, neighbor.w)))) / O.w;
        OUT.COL.x = poisson;
    }
    ...
    return OUT;
}

A really naïve shader

Page 7: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

float2 offset = float2(params.x*center.x - 0.5f*(params.x-1.0f),
                       params.x*center.y - 0.5f*(params.x-1.0f));

float4 neighbor = float4(center.x - 1.0f, center.x + 1.0f,
                         center.y - 1.0f, center.y + 1.0f);

Instruction-Level Parallelism

Page 8: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

float2 offset = center.xy - 0.5f;
offset = offset * params.xx + 0.5f; // MADR is cool too – one cycle, two flops

float4 neighbor = center.xxyy + float4(-1.0f,1.0f,-1.0f,1.0f);

Instruction-Level Parallelism

Page 9: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Data-Level Parallelism

• Pack scalar data into RGBA in texture memory

Page 10: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

Page 11: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Think of your CPU program and your vertex and fragment programs as different levels of nested looping.

...
foreach tri in triangles {
    // run the vertex program on each vertex
    v1 = process_vertex(tri.vertex1);
    v2 = process_vertex(tri.vertex2);
    v3 = process_vertex(tri.vertex3);

    // assemble the vertices into a triangle
    assembledtriangle = setup_tri(v1, v2, v3);

    // rasterize the assembled triangle into [0..many] fragments
    fragments = rasterize(assembledtriangle);

    // run the fragment program on each fragment
    foreach frag in fragments {
        outbuffer[frag.position] = process_fragment(frag);
    }
}
...

Page 12: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Branches
  • Avoid these, especially in the inner loop – i.e., the fragment program.

Page 13: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Static branch resolution
  • write several variants of each fragment program to handle boundary cases

  • eliminates conditionals in the fragment program

  • equivalent to avoiding CPU inner-loop branching

case 2: accounts for boundaries

case 1: no boundaries

Page 14: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Dynamic branching
  • Dynamic branching on NV4x and G70 hardware is better than “branching” with NV3x

    • But still, there is a branch penalty

    • Good perf requires spatial coherence in branching

Page 15: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Branches
  • Ian Buck will talk more about various branching techniques after lunch

Page 16: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Precompute

• Precompute

• Precompute

Page 17: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Precompute texture coordinates
  • Take advantage of under-utilized hardware

    • vertex processor

    • rasterizer

  • Reduce instruction count at the per-fragment level

  • Avoid lookups being treated as texture indirections

Page 18: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

frag2frame Smooth(vert2frag IN, uniform samplerRECT Source : texunit0, uniform samplerRECT Operator : texunit1,
                  uniform samplerRECT Boundary : texunit2, uniform float4 params)
{
    frag2frame OUT;

    float2 center = IN.TexCoord0.xy;
    float4 U = f4texRECT(Source, center);
    // Calculate Red-Black (odd-even) masks
    float2 intpart;
    float2 place = floor(1.0f - modf(round(center + float2(0.5f, 0.5f)) / 2.0f, intpart));
    float2 mask = float2((1.0f-place.x) * (1.0f-place.y), place.x * place.y);
    if (((mask.x + mask.y) && params.y) || (!(mask.x + mask.y) && !params.y))
    {
        float2 offset = float2(params.x*center.x - 0.5f*(params.x-1.0f), params.x*center.y - 0.5f*(params.x-1.0f));
        ...
        float4 neighbor = float4(center.x - 1.0f, center.x + 1.0f, center.y - 1.0f, center.y + 1.0f);
        float central = -2.0f*(O.x + O.y);
        float poisson = ((params.x*params.x)*U.z + (-O.x * f1texRECT(Source, float2(neighbor.x, center.y)) +
                                                    -O.x * f1texRECT(Source, float2(neighbor.y, center.y)) +
                                                    -O.y * f1texRECT(Source, float2(center.x, neighbor.z)) +
                                                    -O.z * f1texRECT(Source, float2(center.x, neighbor.w)))) / O.w;
        OUT.COL.x = poisson;
    }
    ...
    return OUT;
}

Computational Frequency

• Precompute texture coordinates

Page 19: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

vert2frag smooth(app2vert IN, uniform float4x4 xform : C0,
                 uniform float2 srcoffset, uniform float size)
{
    vert2frag OUT;

    OUT.position = mul(xform,IN.position);
    OUT.center = IN.center;
    OUT.redblack = IN.center - srcoffset;
    OUT.operator = size*(OUT.redblack - 0.5f) + 0.5f;
    OUT.hneighbor = IN.center.xxyx + float4(-1.0f, 1.0f, 0.0f, 0.0f);
    OUT.vneighbor = IN.center.xyyy + float4(0.0f, -1.0f, 1.0f, 0.0f);

    return OUT;
}

Computational Frequency

• Precompute texture coordinates

Page 20: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Precomputing other values
  • Same deal! Factor other computations out:

    • Anything that varies linearly across the geometry

    • Anything that has a complex value computed per-vertex

    • Anything that is uniform across the geometry

Page 21: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Precomputing on the CPU
  • Use glMultiTexCoord4f() creatively

  • Extract as much uniformity from uniform parameters as you can

Page 22: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

// Calculate Red-Black (odd-even) masks
float2 intpart;
float2 place = floor(1.0f - modf(round(center + 0.5f) / 2.0f,
                                 intpart));
float2 mask = float2((1.0f-place.x) * (1.0f-place.y),
                     place.x * place.y);

if (((mask.x + mask.y) && params.y) ||
    (!(mask.x + mask.y) && !params.y))
{
    ...

Computational Frequency

• Precomputed lookup tables

Page 23: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

half4 mask = f4texRECT(RedBlack, IN.redblack);
/*
 * mask.x and mask.w tell whether IN.center.x and IN.center.y
 * are both odd or both even, respectively. either of these two
 * conditions indicates that the fragment is red. params.x==1
 * selects red; params.y==1 selects black.
 */
if (dot(mask,params.xyyx))
{
    ...

Computational Frequency

• Precomputed lookup tables

Page 24: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Computational Frequency

• Precomputed lookup tables
  • Be careful with texture lookups – cache coherence is crucial

  • Use the smallest data types you can get away with to reduce bandwidth consumption

  • Use swizzles or writemasks on tex ops when possible

  • “Computation is cheap; memory accesses are not.” ...if you’re memory access limited.

Page 25: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Profiling and Load Balancing

Page 26: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Profiling and Load Balancing

• Software profiling

• GPU pipeline profiling

• GPU load balancing

Page 27: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

• Run a standard software profiler!
  • Rational Quantify

  • Intel VTune

  • AMD CodeAnalyst

Profiling and Load Balancing

Page 28: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

• GPU Pipeline Profiling
  • This is where it gets tricky.

  • Some tools exist to help you:
    • NVPerfKit
      NVIDIA exhibitor tech talk tomorrow morning at 10am in room 404A

    • NVPerfHUD
      http://developer.nvidia.com/docs/IO/8343/How-To-Profile.pdf

    • NVShaderPerf
      http://developer.nvidia.com/object/nvshaderperf_home.html

    • Apple OpenGL Profiler
      http://developer.apple.com/opengl/profiler_image.html

Profiling and Load Balancing

Page 29: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

• GPU Load Balancing
  • This is a whole talk in and of itself

    • e.g., http://developer.nvidia.com/docs/IO/8343/Performance-Optimisation.pdf

  • Be sure to read the NVIDIA GPU Programming Guide
    • http://developer.nvidia.com/object/gpu_programming_guide.html

  • Sometimes you can get more hints from third parties than from the vendors themselves
    • http://www.3dcenter.de/artikel/cinefx/index6_e.php

    • http://www.3dcenter.de/artikel/nv40_technik/

Profiling and Load Balancing

Page 30: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Conclusions

Page 31: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Conclusions

• Get used to thinking in terms of parallel computation

• Understand how frequently each computation will run, and reduce that frequency wherever possible

• Track down bottlenecks in your application, and shift work to other parts of the system that are idle

Page 32: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

Questions?

• Acknowledgements
  • Pat Brown at NVIDIA

  • NVIDIA for having given me a job this summer

  • Dave Luebke, my advisor

  • GPGPU course presenters

Page 33: GPU Program Optimization Cliff Woolley University of Virginia / NVIDIA.

See Also

• GPU Gems II, Chapter 35


Recommended