Performance of data read/write between fp16 and fp32

I use fp16 on a “PowerVR GE8300”, and after investigating I found that the bottleneck is memory bandwidth.

However, the performance of the fp16 and fp32 OpenCL kernels is almost the same. Since fp16 is half the size of fp32, the fp16 kernel should be faster, but that is not what I see. Could you help me check this? Am I using the wrong API to read/write memory?

Both the fp16 and fp32 kernels have the same inputs/outputs (except for the data type) and the same kernel structure:

     int input_size  = 1 * 112 * 112 * 16;    // input0
     int weight_size = 96 * 1 * 1 * 16;       // input1
     int output_size = 1 * 112 * 112 * 96;    // tensor
     int bias_size   = 96;                    // input2
     size_t local_work_size[]  = {8, 8, 2};
     size_t global_work_size[] = {112, 112, 6};
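
For reference, a simplified sketch of what the corresponding host-side launch with these sizes looks like (buffer, queue and kernel names are illustrative, error handling is omitted):

    // Illustrative host-side launch using the sizes declared above
    // (assumes <CL/cl.h>, plus an existing queue, kernel and cl_mem buffers).
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &input0_buf);  // 1*112*112*16 elements
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &input1_buf);  // 96*1*1*16 elements
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &tensor_buf);  // 1*112*112*96 elements
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &input2_buf);  // 96 elements
    clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                           global_work_size, local_work_size, 0, NULL, NULL);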

The fp32 kernel code is:

__attribute__((reqd_work_group_size(8,8,2)))
__kernel void fuse_conv2d_clip_37_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) {
   float compute[16];
  __local float pad_temp_shared[1024];
   float pad_temp_shared_local[16];
   float input1_shared_local[256];
  
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer_init = 0; nn_ff_fused_yy_fused_xx_fused_outer_init < 8; ++nn_ff_fused_yy_fused_xx_fused_outer_init) {
    compute[(nn_ff_fused_yy_fused_xx_fused_outer_init * 2)] = 0.000000e+00f;
    compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 2))] = 0.000000e+00f;
  }
  vstore8(vload8(0, input0 + (((((((int)get_group_id(1)) * 896) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 100352)) + (((int)get_local_id(1)) * 12544)) + (((int)get_local_id(0)) * 112))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 512) + (((int)get_local_id(1)) * 64)) + (((int)get_local_id(0)) * 8)));
  barrier(CLK_LOCAL_MEM_FENCE);
  
#pragma unroll
for (int ax1 = 0; ax1 < 16; ++ax1) {
    pad_temp_shared_local[ax1] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (ax1 * 64))];
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  vstore4(vload4(0, input1 + ((((((int)get_group_id(2)) * 512) + (((int)get_local_id(2)) * 256)) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 256) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4)));
  barrier(CLK_LOCAL_MEM_FENCE);
  
#pragma unroll
for (int ax0 = 0; ax0 < 16; ++ax0) {
    
#pragma unroll
for (int ax11 = 0; ax11 < 16; ++ax11) {
      input1_shared_local[((ax0 * 16) + ax11)] = pad_temp_shared[(((((int)get_local_id(2)) * 256) + (ax0 * 16)) + ax11)];
    }
  }
  
#pragma unroll
for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
    
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer = 0; nn_ff_fused_yy_fused_xx_fused_outer < 8; ++nn_ff_fused_yy_fused_xx_fused_outer) {
      compute[(nn_ff_fused_yy_fused_xx_fused_outer * 2)] = (compute[(nn_ff_fused_yy_fused_xx_fused_outer * 2)] + (pad_temp_shared_local[rc_inner] * input1_shared_local[(rc_inner + (nn_ff_fused_yy_fused_xx_fused_outer * 32))]));
      compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 2))] = (compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 2))] + (pad_temp_shared_local[rc_inner] * input1_shared_local[((16 + rc_inner) + (nn_ff_fused_yy_fused_xx_fused_outer * 32))]));
    }
  }
  
#pragma unroll
for (int ax1_inner_inner = 0; ax1_inner_inner < 16; ++ax1_inner_inner) {
    tensor[(((((((((int)get_group_id(2)) * 401408) + (((int)get_group_id(1)) * 896)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 200704)) + (((int)get_local_id(1)) * 112)) + ((int)get_local_id(0))) + (ax1_inner_inner * 12544))] = max(min((compute[ax1_inner_inner] + input2[(((((int)get_group_id(2)) * 32) + (((int)get_local_id(2)) * 16)) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f);
  }
}

The fp16 kernel code is:

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__attribute__((reqd_work_group_size(8,8,2)))
__kernel void fuse_conv2d_clip_37_kernel0(__global half* restrict input0, __global half* restrict input1, __global half* restrict tensor, __global half* restrict input2) {
   half compute[16];
  __local half pad_temp_shared[1024];
   half pad_temp_shared_local[16];
   half input1_shared_local[256];
  
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer_init = 0; nn_ff_fused_yy_fused_xx_fused_outer_init < 4; ++nn_ff_fused_yy_fused_xx_fused_outer_init) {
    compute[(nn_ff_fused_yy_fused_xx_fused_outer_init * 4)] = (half)0.000000e+00f;
    compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
    compute[(2 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
    compute[(3 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
  }
  vstore8(vload8(0, input0 + (((((((int)get_group_id(1)) * 896) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 100352)) + (((int)get_local_id(1)) * 12544)) + (((int)get_local_id(0)) * 112))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 512) + (((int)get_local_id(1)) * 64)) + (((int)get_local_id(0)) * 8)));
  barrier(CLK_LOCAL_MEM_FENCE);
  
#pragma unroll
for (int ax1 = 0; ax1 < 16; ++ax1) {
    pad_temp_shared_local[ax1] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (ax1 * 64))];
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  vstore4(vload4(0, input1 + ((((((int)get_group_id(2)) * 512) + (((int)get_local_id(2)) * 256)) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 256) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4)));
  barrier(CLK_LOCAL_MEM_FENCE);
  
#pragma unroll
for (int ax0 = 0; ax0 < 16; ++ax0) {
    
#pragma unroll
for (int ax11 = 0; ax11 < 16; ++ax11) {
      input1_shared_local[((ax0 * 16) + ax11)] = pad_temp_shared[(((((int)get_local_id(2)) * 256) + (ax0 * 16)) + ax11)];
    }
  }
  
#pragma unroll
for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
    
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer = 0; nn_ff_fused_yy_fused_xx_fused_outer < 4; ++nn_ff_fused_yy_fused_xx_fused_outer) {
      compute[(nn_ff_fused_yy_fused_xx_fused_outer * 4)] = (compute[(nn_ff_fused_yy_fused_xx_fused_outer * 4)] + (pad_temp_shared_local[rc_inner] * input1_shared_local[(rc_inner + (nn_ff_fused_yy_fused_xx_fused_outer * 64))]));
    }
  }
  
#pragma unroll
for (int ax1_inner_inner = 0; ax1_inner_inner < 16; ++ax1_inner_inner) {
    tensor[(((((((((int)get_group_id(2)) * 401408) + (((int)get_group_id(1)) * 896)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 200704)) + (((int)get_local_id(1)) * 112)) + ((int)get_local_id(0))) + (ax1_inner_inner * 12544))] = max(min((compute[ax1_inner_inner] + input2[(((((int)get_group_id(2)) * 32) + (((int)get_local_id(2)) * 16)) + ax1_inner_inner)]), (half)6.000000e+00f), (half)0.000000e+00f);
  }
}

Hi iuni,

Unfortunately, due to the current circumstances I do not have access to an appropriate device to test your kernels. But the theory is that using half floats reduces register pressure and thus increases thread occupancy (if too much register space is required per instance, fewer instances can be processed in parallel), and it also reduces the kernel cycle count, since two SOP or MAD instructions can be performed in a single cycle for half floats.
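
To illustrate the ALU side of that, here is a rough sketch (not your kernel, just an illustration) of how half2 arithmetic exposes that dual-rate path once cl_khr_fp16 is enabled:

    // Illustration only: the two 16-bit lanes of a half2 multiply-add can be
    // issued together, where the float version of the same code would issue
    // a single 32-bit MAD per cycle.
    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    __kernel void mad_half2(__global const half2* a,
                            __global const half2* b,
                            __global half2* out) {
        int i = get_global_id(0);
        out[i] = a[i] * b[i] + out[i];   // one half2 MAD = two scalar half MADs
    }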

However, if the kernel doesn’t suffer from register pressure (i.e. it already runs at 100% occupancy) and isn’t ALU limited, then it would make sense that you don’t see a performance boost from those effects.

However, in your case you have a bandwidth bottleneck, and you’ve changed the inputs and outputs to halves, which should reduce the bandwidth used. Maybe the stride between values is still 32 bits and the equivalent of full floats is still being read and written for some reason?
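
One concrete thing worth double-checking is the host-side allocation and upload: if the fp16 buffers are still sized (or filled) using sizeof(float), the GPU ends up streaming the same number of bytes as in the fp32 case. A minimal sketch of what I mean (buffer and variable names assumed, error handling omitted):

    // fp32 path: 4 bytes per element.
    cl_mem in_f32 = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
                                   input_size * sizeof(cl_float), NULL, &err);

    // fp16 path: must be 2 bytes per element, and the data written into it
    // must already be converted to 16-bit halves (cl_half); otherwise the
    // kernel still reads/writes fp32-sized data.
    cl_mem in_f16 = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
                                   input_size * sizeof(cl_half), NULL, &err);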

Your code looks correct though; I’ll try to look further into this.

A quick side note: the wavefront on that GPU (and nearly all other PowerVR GPUs) is 32, so we’ve found that the optimal work-group size is also 32 (your work-group is 8x8x2 = 128, which might negatively impact performance).
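
For example, a 32-item work-group that still divides your 112x112x6 global size evenly could be launched like this (launch side only; the kernel’s reqd_work_group_size attribute and its local indexing would of course have to be re-tiled to match):

    // 8 * 4 * 1 = 32 work-items per work-group, matching the 32-wide wavefront.
    // (112, 112, 6) is evenly divisible by (8, 4, 1).
    size_t local_work_size[]  = {8, 4, 1};
    size_t global_work_size[] = {112, 112, 6};
    clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                           global_work_size, local_work_size, 0, NULL, NULL);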

And out of interest, do you have PVRTune recordings for the two kernels? They would be interesting to look at.

Cheers,
David

Hi David,

After profiling the kernel, I found that most of the time (>90%) is consumed by the following code:

#pragma unroll
for (int ax1_inner_inner = 0; ax1_inner_inner < 16; ++ax1_inner_inner) {
    tensor[(((((((((int)get_group_id(2)) * 401408) + (((int)get_group_id(1)) * 896)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 200704)) + (((int)get_local_id(1)) * 112)) + ((int)get_local_id(0))) + (ax1_inner_inner * 12544))] = max(min((compute[ax1_inner_inner] + input2[(((((int)get_group_id(2)) * 32) + (((int)get_local_id(2)) * 16)) + ax1_inner_inner)]), (half)6.000000e+00f), (half)0.000000e+00f);
  }

More specifically, within that loop, reading input2 and writing tensor (both in global memory) accounted for more than 90% of the time.
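
A rough traffic estimate from the sizes above also explains why this loop dominates: the output tensor has six times as many elements as the input, so the final write to tensor is where most of the bandwidth goes (back-of-the-envelope only):

    // Back-of-the-envelope traffic per launch, using the sizes above.
    int input_elems  = 1 * 112 * 112 * 16;   //   200,704 elements
    int output_elems = 1 * 112 * 112 * 96;   // 1,204,224 elements (~6x the input)
    // fp32: the tensor write alone is ~4.8 MB; fp16 halves it to ~2.4 MB,
    // but it still dwarfs every other transfer in the kernel.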

I didn’t get the opportunity to record a PVRTune capture, since I won’t be optimizing this kernel further now that my tasks have changed. However, I’m sharing my findings here in case they help others.

Thanks!