LibreDR ARM Mali Porting – Bohan Yu's Homepage

Recently, I’m porting my differentiable rendering engine LibreDR to ARM Mali platform. I’m using a FrienclyELEC NanoPC-T4 SBC, which contains an Mali-T860 GPU and OpenCL 1.2 support. Part of the clinfo output are:

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory

After compiling and running the examples, the depth, material, gradient images all look good, except the rendered image shows totally black. The rendered image is gathered from all samples by an atomic_add_f function. Because cl_ext_float_atomics is not implemented by ARM (the same as NVIDIA and AMD), I’m using an implementation from StreamHPC. After a long debugging process, I found the minimum C code that produces wrong result on ARM OpenCL driver:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/cl.h>

const char *kernel_source =
  "void atomic_add_f(volatile global float *source, const float operand) {\n"
  "  if (operand == 0.f)\n"
  "    return;\n"
  "  union {\n"
  "    uint u32;\n"
  "    float f32;\n"
  "  } next, expected, current;\n"
  "  current.f32 = *source;\n"
  "  do {\n"
  "    expected.f32 = current.f32;\n"
  "    next.f32 = expected.f32 + operand;\n"
  "    current.u32 = atomic_cmpxchg((volatile __global uint *)source, expected.u32, next.u32);\n"
  "  } while (current.u32 != expected.u32);\n"
  "}\n"
  "\n"
  "void kernel bug_test(volatile global float a[512][128]) {\n"
  "  atomic_add_f(&a[255][get_global_id(0)], 1.0f);\n"
  "  atomic_add_f(&a[256][get_global_id(0)], 1.0f);\n"
  "}\n";

int main() {
  printf("%s", kernel_source);
  cl_int CL_err = CL_SUCCESS;
  cl_device_id device_id;
  CL_err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
  assert(CL_err == CL_SUCCESS);
  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);
  assert(context != NULL);
  cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, NULL);
  assert(commands != NULL);
  cl_program program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, NULL);
  assert(program != NULL);
  CL_err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
  if (!CL_err == CL_SUCCESS) {
    size_t logsize;
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logsize);
    assert(CL_err == CL_SUCCESS);
    char *buffer = (char *)malloc(logsize * sizeof(char));
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logsize, buffer, NULL);
    assert(CL_err == CL_SUCCESS);
    printf("log:\n%s\n", buffer);
    return -1;
  }
  cl_kernel kernel = clCreateKernel(program, "bug_test", &CL_err);
  printf("clCreateKernel, %d\n", CL_err);
  assert(kernel != NULL);
  cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, 512 * 128 * sizeof(cl_float), NULL, NULL);
  assert(mem != NULL);
  cl_float pattern = 0.f;
  CL_err = clEnqueueFillBuffer(commands, mem, &pattern, sizeof(cl_float), 0, 512 * 128 * sizeof(cl_float), 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  CL_err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);
  assert(CL_err == CL_SUCCESS);
  size_t global_work_size[1] = { 128 };
  size_t local_work_size[1] = { 1 };
  CL_err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  cl_float host_buffer[512 * 128];
  CL_err = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, 512 * 128 * sizeof(cl_float), host_buffer, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  for (int i = 255 * 128; i < 257 * 128; ++i) {
    printf("%.0f%c", host_buffer[i], i == 9 ? '\n' : ' ');
  }
  printf("\n");
  return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/cl.h>

const char *kernel_source =
  "void atomic_add_f(volatile global float *source, const float operand) {\n"
  "  if (operand == 0.f)\n"
  "    return;\n"
  "  union {\n"
  "    uint u32;\n"
  "    float f32;\n"
  "  } next, expected, current;\n"
  "  current.f32 = *source;\n"
  "  do {\n"
  "    expected.f32 = current.f32;\n"
  "    next.f32 = expected.f32 + operand;\n"
  "    current.u32 = atomic_cmpxchg((volatile __global uint *)source, expected.u32, next.u32);\n"
  "  } while (current.u32 != expected.u32);\n"
  "}\n"
  "\n"
  "void kernel bug_test(volatile global float a[512][128]) {\n"
  "  atomic_add_f(&a[255][get_global_id(0)], 1.0f);\n"
  "  atomic_add_f(&a[256][get_global_id(0)], 1.0f);\n"
  "}\n";

int main() {
  printf("%s", kernel_source);
  cl_int CL_err = CL_SUCCESS;
  cl_device_id device_id;
  CL_err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
  assert(CL_err == CL_SUCCESS);
  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);
  assert(context != NULL);
  cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, NULL);
  assert(commands != NULL);
  cl_program program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, NULL);
  assert(program != NULL);
  CL_err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
  if (!CL_err == CL_SUCCESS) {
    size_t logsize;
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logsize);
    assert(CL_err == CL_SUCCESS);
    char *buffer = (char *)malloc(logsize * sizeof(char));
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logsize, buffer, NULL);
    assert(CL_err == CL_SUCCESS);
    printf("log:\n%s\n", buffer);
    return -1;
  }
  cl_kernel kernel = clCreateKernel(program, "bug_test", &CL_err);
  printf("clCreateKernel, %d\n", CL_err);
  assert(kernel != NULL);
  cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, 512 * 128 * sizeof(cl_float), NULL, NULL);
  assert(mem != NULL);
  cl_float pattern = 0.f;
  CL_err = clEnqueueFillBuffer(commands, mem, &pattern, sizeof(cl_float), 0, 512 * 128 * sizeof(cl_float), 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  CL_err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);
  assert(CL_err == CL_SUCCESS);
  size_t global_work_size[1] = { 128 };
  size_t local_work_size[1] = { 1 };
  CL_err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  cl_float host_buffer[512 * 128];
  CL_err = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, 512 * 128 * sizeof(cl_float), host_buffer, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  for (int i = 255 * 128; i < 257 * 128; ++i) {
    printf("%.0f%c", host_buffer[i], i == 9 ? '\n' : ' ');
  }
  printf("\n");
  return 0;
}

Expand

In the kernel function bug_test, the kernel submits consecutive atomic_add_f jobs (random atomic_add_f jobs works fine). The kernel works fine when we perform atomic_add_f on address lower than 256 * 128 * sizeof(float). However, the program either returns CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, or produces wrong results (if CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE is enabled). I think It’s a bug in the ARM OpenCL implementation, which can hardly fixed because ARM OpenCL is proprietary and provides to OEM only.

In my case, this bug can be circumvented by gathering the results in order. However, this may not be possible in all circumstances. In the future, I’m willing to try Rusticl on Panfrost driver for a more open-source implementation.