performance - unnecessary CVT instructions in PTX binary generated from OpenCL -


I have written a simple OpenCL kernel and tried to execute it on a Tesla K40m GPU to measure its GFLOPS. Here is the code I've written:

__kernel void test(__global float *gin, __global float *gout, int m, int n, int p) {     int x = get_global_id(0);     // private variable     float temp = 1.0;       // start of new level of loop     int baseindex1 = (x) * 512;     temp += gin[baseindex1 + 0] * var;     temp += gin[baseindex1 + 1] * var;     temp += gin[baseindex1 + 2] * var;     temp += gin[baseindex1 + 3] * var;     temp += gin[baseindex1 + 4] * var;     temp += gin[baseindex1 + 5] * var;     temp += gin[baseindex1 + 6] * var;     temp += gin[baseindex1 + 7] * var;     temp += gin[baseindex1 + 8] * var;     temp += gin[baseindex1 + 9] * var;     temp += gin[baseindex1 + 10] * var;     ...     temp += gin[baseindex1 + 510] * var;     temp += gin[baseindex1 + 511] * var;     gout[baseindex1] = temp; } 

I have deployed the kernel on the GPU with a global_work_size of [1048576] and a local_work_size of [128]. The total number of floating-point operations it can perform per second is around 1.6 GFLOPS, which is extremely low. I assume I'm doing single-precision operations and that memory is being read sequentially. I've decided to take a look at the generated PTX code:

.version 5.0 .target sm_35, texmode_independent .address_size 64      // .globl   test .func  (.param .b64 func_retval0) get_global_id (     .param .b32 get_global_id_param_0 ) ;  .entry test(     .param .u64 .ptr .global .align 4 test_param_0,     .param .u64 .ptr .global .align 4 test_param_1,     .param .u32 test_param_2,     .param .u32 test_param_3,     .param .u32 test_param_4 ) {     .reg .f32   %f<1537>;     .reg .b32   %r<515>;     .reg .b64   %rd<1543>;       ld.param.u64    %rd1, [test_param_0];     ld.param.u64    %rd2, [test_param_1];     mov.u32     %r1, 0;     // callseq start 0     {     .reg .b32 temp_param_reg;     // <end>}     .param .b32 param0;     st.param.b32    [param0+0], %r1;     .param .b64 retval0;     call.uni (retval0),      get_global_id,      (     param0     );     ld.param.b64    %rd3, [retval0+0];      //{     }// callseq end 0     cvt.u32.u64 %r2, %rd3;     mul.lo.s32  %r3, %r2, 512;     cvt.s64.s32 %rd4, %r3;     shl.b64     %rd5, %rd4, 2;     add.s64     %rd6, %rd1, %rd5;     ld.global.f32   %f1, [%rd6];     mul.f32     %f2, %f1, 0f3fc00000;     add.f32     %f3, %f2, 0f3f800000;     add.s32     %r4, %r3, 1;     cvt.s64.s32 %rd7, %r4;     shl.b64     %rd8, %rd7, 2;     add.s64     %rd9, %rd1, %rd8;     ld.global.f32   %f4, [%rd9];     mul.f32     %f5, %f4, 0f3fc00000;     add.f32     %f6, %f3, %f5;     add.s32     %r5, %r3, 2;     cvt.s64.s32 %rd10, %r5;     shl.b64     %rd11, %rd10, 2;     add.s64     %rd12, %rd1, %rd11;     ld.global.f32   %f7, [%rd12];     mul.f32     %f8, %f7, 0f3fc00000;     add.f32     %f9, %f6, %f8;     add.s32     %r6, %r3, 3;     cvt.s64.s32 %rd13, %r6;     shl.b64     %rd14, %rd13, 2;     add.s64     %rd15, %rd1, %rd14;     ld.global.f32   %f10, [%rd15];     mul.f32     %f11, %f10, 0f3fc00000;     add.f32     %f12, %f9, %f11;     add.s32     %r7, %r3, 4;     cvt.s64.s32 %rd16, %r7;     shl.b64     %rd17, %rd16, 2;     add.s64     %rd18, %rd1, %rd17;     ld.global.f32   %f13, [%rd18];     mul.f32     
%f14, %f13, 0f3fc00000;     add.f32     %f15, %f12, %f14;     add.s32     %r8, %r3, 5;     cvt.s64.s32 %rd19, %r8;     shl.b64     %rd20, %rd19, 2;     add.s64     %rd21, %rd1, %rd20;     ld.global.f32   %f16, [%rd21];     mul.f32     %f17, %f16, 0f3fc00000;     add.f32     %f18, %f15, %f17; 

As is clear from the code, it has unnecessary cvt and shl instructions, which I assume are one potential cause of the overhead.

Now I have two questions here: (1) How should I rewrite the kernel to get rid of the two mentioned instructions and make the kernel perform faster? (2) Are there other sources of overhead in this code that I'm unaware of?

If var is of double type, a convert instruction is required, because the float source cannot be added to it directly.

Using the same temp for every addition is a pipeline stopper: each add depends on the result of the previous one, serializing the dependency chain.

Accessing the array with a stride of 512 floats uses only one memory channel and one memory bank at a time. This may serialize the memory operations on top of the already-serialized instructions per thread.

Doing a reduction between far-apart items rather than neighbors — in pairs, or maybe 4 items per thread — would solve the memory problem.

Using multiple temps solves the pipeline problem.

Put an f postfix on float literals if they are not meant to be doubles. Try to avoid adding doubles and floats together repeatedly.

Using a different memory channel per thread is good.

Letting the compiler/hardware rename registers is good.

Accumulating fewer values into the same register is also good: it lowers the probability of the round-off error becoming large relative to the value being added.

The shifting appears to be address calculation for a float size of 4 bytes (a left shift by 2 produces the byte address) — maybe the buffer is not aligned? Computing the base index plus the pointer once, and then adding the other offsets to it, instead of recalculating the base and the additions on every line, would avoid this slowdown. Also, maybe the gin parameter needs a restrict or const qualifier before the compiler will include it in its automatic optimization considerations.


Comments

Popular posts from this blog

python Tkinter Capturing keyboard events save as one single string -

android - InAppBilling registering BroadcastReceiver in AndroidManifest -

javascript - Z-index in d3.js -