I use fp16 on a “PowerVR GE8300” GPU; after investigating, I found that the bottleneck is memory bandwidth.
However, the performance of the fp16 and fp32 OpenCL kernels is almost the same. Since fp16 is half the size of fp32, fp16 should have better performance, but the results suggest otherwise. Could you help check it — am I using the wrong API to read/write memory?
Both the fp16 and fp32 kernels have the same inputs/outputs (except for data type) and the same kernel architecture.
// Host-side tensor sizes (element counts, NCHW) and the NDRange launch
// configuration shared by both the fp32 and fp16 kernels below.
// NOTE: the kernels' hard-coded index constants (896, 12544, 100352, 401408, ...)
// are derived from these dimensions, so they must stay in sync.
int input_size = 1 * 112 * 112 * 16; //input0: activation, 1x16x112x112
int weight_size = 96 * 1 * 1 * 16; //input1: 1x1 conv weights, 96 out x 16 in
int output_size = 1 * 112 * 112 * 96; //tensor: output activation, 1x96x112x112
int bias_size = 96; //input2: one bias per output channel
size_t local_work_size[] = {8,8,2};   // must match reqd_work_group_size(8,8,2)
size_t global_work_size[] = {112,112,6}; // 112x112 spatial, 6 groups x 16 ch = 96 channels
The fp32 kernel code is:
_attribute__((reqd_work_group_size(8,8,2)))
__kernel void fuse_conv2d_clip_37_kernel0(__global float* restrict input0, __global float* restrict input1, __global float* restrict tensor, __global float* restrict input2) {
float compute[16];
__local float pad_temp_shared[1024];
float pad_temp_shared_local[16];
float input1_shared_local[256];
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer_init = 0; nn_ff_fused_yy_fused_xx_fused_outer_init < 8; ++nn_ff_fused_yy_fused_xx_fused_outer_init) {
compute[(nn_ff_fused_yy_fused_xx_fused_outer_init * 2)] = 0.000000e+00f;
compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 2))] = 0.000000e+00f;
}
vstore8(vload8(0, input0 + (((((((int)get_group_id(1)) * 896) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 100352)) + (((int)get_local_id(1)) * 12544)) + (((int)get_local_id(0)) * 112))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 512) + (((int)get_local_id(1)) * 64)) + (((int)get_local_id(0)) * 8)));
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int ax1 = 0; ax1 < 16; ++ax1) {
pad_temp_shared_local[ax1] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (ax1 * 64))];
}
barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, input1 + ((((((int)get_group_id(2)) * 512) + (((int)get_local_id(2)) * 256)) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 256) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4)));
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int ax0 = 0; ax0 < 16; ++ax0) {
#pragma unroll
for (int ax11 = 0; ax11 < 16; ++ax11) {
input1_shared_local[((ax0 * 16) + ax11)] = pad_temp_shared[(((((int)get_local_id(2)) * 256) + (ax0 * 16)) + ax11)];
}
}
#pragma unroll
for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer = 0; nn_ff_fused_yy_fused_xx_fused_outer < 8; ++nn_ff_fused_yy_fused_xx_fused_outer) {
compute[(nn_ff_fused_yy_fused_xx_fused_outer * 2)] = (compute[(nn_ff_fused_yy_fused_xx_fused_outer * 2)] + (pad_temp_shared_local[rc_inner] * input1_shared_local[(rc_inner + (nn_ff_fused_yy_fused_xx_fused_outer * 32))]));
compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 2))] = (compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 2))] + (pad_temp_shared_local[rc_inner] * input1_shared_local[((16 + rc_inner) + (nn_ff_fused_yy_fused_xx_fused_outer * 32))]));
}
}
#pragma unroll
for (int ax1_inner_inner = 0; ax1_inner_inner < 16; ++ax1_inner_inner) {
tensor[(((((((((int)get_group_id(2)) * 401408) + (((int)get_group_id(1)) * 896)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 200704)) + (((int)get_local_id(1)) * 112)) + ((int)get_local_id(0))) + (ax1_inner_inner * 12544))] = max(min((compute[ax1_inner_inner] + input2[(((((int)get_group_id(2)) * 32) + (((int)get_local_id(2)) * 16)) + ax1_inner_inner)]), 6.000000e+00f), 0.000000e+00f);
}
}
The fp16 kernel code is:
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
/*
 * fp16 version of the fused 1x1 conv + bias + clip-to-[0,6] kernel above.
 *
 * Fix: the accumulation loop only updated compute[outer*4] (indices
 * 0, 4, 8, 12), yet the epilogue writes all 16 accumulators — so 12 of the
 * 16 outputs per work-item were never accumulated and emitted clip(bias)
 * only. Restored the missing three accumulations per outer iteration so
 * compute[k] += in * weight[k*16 + rc], matching the fp32 kernel.
 *
 * NOTE(review): each work-item now loads only 8 halves (16 bytes) of input0
 * per vload8, half the fp32 traffic — if fp32/fp16 timings still match,
 * the bottleneck is presumably not this load; verify with a profiler.
 */
__attribute__((reqd_work_group_size(8,8,2)))
__kernel void fuse_conv2d_clip_37_kernel0(__global half* restrict input0, __global half* restrict input1, __global half* restrict tensor, __global half* restrict input2) {
half compute[16];                    // per-work-item accumulators (16 output channels)
__local half pad_temp_shared[1024];  // reused: first input tile, then weight tile
half pad_temp_shared_local[16];      // this work-item's 16 input-channel values
half input1_shared_local[256];       // 16 out-channels x 16 in-channels of weights
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer_init = 0; nn_ff_fused_yy_fused_xx_fused_outer_init < 4; ++nn_ff_fused_yy_fused_xx_fused_outer_init) {
compute[(nn_ff_fused_yy_fused_xx_fused_outer_init * 4)] = (half)0.000000e+00f;
compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
compute[(2 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
compute[(3 + (nn_ff_fused_yy_fused_xx_fused_outer_init * 4))] = (half)0.000000e+00f;
}
// Stage the input tile into local memory: each work-item copies 8 halves (16 bytes).
vstore8(vload8(0, input0 + (((((((int)get_group_id(1)) * 896) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 100352)) + (((int)get_local_id(1)) * 12544)) + (((int)get_local_id(0)) * 112))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 512) + (((int)get_local_id(1)) * 64)) + (((int)get_local_id(0)) * 8)));
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int ax1 = 0; ax1 < 16; ++ax1) {
pad_temp_shared_local[ax1] = pad_temp_shared[(((((int)get_local_id(1)) * 8) + ((int)get_local_id(0))) + (ax1 * 64))];
}
// Second barrier: everyone must finish reading the input tile before the
// same local buffer is overwritten with weights.
barrier(CLK_LOCAL_MEM_FENCE);
vstore4(vload4(0, input1 + ((((((int)get_group_id(2)) * 512) + (((int)get_local_id(2)) * 256)) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4))), 0, pad_temp_shared + (((((int)get_local_id(2)) * 256) + (((int)get_local_id(1)) * 32)) + (((int)get_local_id(0)) * 4)));
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int ax0 = 0; ax0 < 16; ++ax0) {
#pragma unroll
for (int ax11 = 0; ax11 < 16; ++ax11) {
input1_shared_local[((ax0 * 16) + ax11)] = pad_temp_shared[(((((int)get_local_id(2)) * 256) + (ax0 * 16)) + ax11)];
}
}
// Inner product over the 16 input channels; compute[k] uses weight row k
// (input1_shared_local[k*16 + rc_inner]), four rows per outer iteration.
#pragma unroll
for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
#pragma unroll
for (int nn_ff_fused_yy_fused_xx_fused_outer = 0; nn_ff_fused_yy_fused_xx_fused_outer < 4; ++nn_ff_fused_yy_fused_xx_fused_outer) {
compute[(nn_ff_fused_yy_fused_xx_fused_outer * 4)] = (compute[(nn_ff_fused_yy_fused_xx_fused_outer * 4)] + (pad_temp_shared_local[rc_inner] * input1_shared_local[(rc_inner + (nn_ff_fused_yy_fused_xx_fused_outer * 64))]));
compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] = (compute[(1 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] + (pad_temp_shared_local[rc_inner] * input1_shared_local[((16 + rc_inner) + (nn_ff_fused_yy_fused_xx_fused_outer * 64))]));
compute[(2 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] = (compute[(2 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] + (pad_temp_shared_local[rc_inner] * input1_shared_local[((32 + rc_inner) + (nn_ff_fused_yy_fused_xx_fused_outer * 64))]));
compute[(3 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] = (compute[(3 + (nn_ff_fused_yy_fused_xx_fused_outer * 4))] + (pad_temp_shared_local[rc_inner] * input1_shared_local[((48 + rc_inner) + (nn_ff_fused_yy_fused_xx_fused_outer * 64))]));
}
}
// Bias add + clip to [0, 6], then scatter the 16 channel results to global memory.
#pragma unroll
for (int ax1_inner_inner = 0; ax1_inner_inner < 16; ++ax1_inner_inner) {
tensor[(((((((((int)get_group_id(2)) * 401408) + (((int)get_group_id(1)) * 896)) + (((int)get_group_id(0)) * 8)) + (((int)get_local_id(2)) * 200704)) + (((int)get_local_id(1)) * 112)) + ((int)get_local_id(0))) + (ax1_inner_inner * 12544))] = max(min((compute[ax1_inner_inner] + input2[(((((int)get_group_id(2)) * 32) + (((int)get_local_id(2)) * 16)) + ax1_inner_inner)]), (half)6.000000e+00f), (half)0.000000e+00f);
}
}