ybh1998 – Bohan Yu's Homepage

Blogs

LibreDR ARM Mali Porting

by ybh1998 2025-07-07

Recently, I’m porting my differentiable rendering engine LibreDR to ARM Mali platform. I’m using a FrienclyELEC NanoPC-T4 SBC, which contains an Mali-T860 GPU and OpenCL 1.2 support. Part of the clinfo output are:

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory

Number of platforms 1
Platform Name ARM Platform
Platform Vendor ARM
Platform Version OpenCL 1.2 v1.r14p0-01rel0-git(966ed26).f44c85cb3d2ceb87e8be88e7592755c3
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_byte_addressable_store cl_khr_3d_image_writes cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_fp16 cl_khr_gl_sharing cl_khr_icd cl_khr_egl_event cl_khr_egl_image cl_khr_image2d_from_buffer cl_arm_core_id cl_arm_printf cl_arm_thread_limit_hint cl_arm_non_uniform_work_group_size cl_arm_import_memory

After compiling and running the examples, the depth, material, gradient images all look good, except the rendered image shows totally black. The rendered image is gathered from all samples by an atomic_add_f function. Because cl_ext_float_atomics is not implemented by ARM (the same as NVIDIA and AMD), I’m using an implementation from StreamHPC. After a long debugging process, I found the minimum C code that produces wrong result on ARM OpenCL driver:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/cl.h>

const char *kernel_source =
  "void atomic_add_f(volatile global float *source, const float operand) {\n"
  "  if (operand == 0.f)\n"
  "    return;\n"
  "  union {\n"
  "    uint u32;\n"
  "    float f32;\n"
  "  } next, expected, current;\n"
  "  current.f32 = *source;\n"
  "  do {\n"
  "    expected.f32 = current.f32;\n"
  "    next.f32 = expected.f32 + operand;\n"
  "    current.u32 = atomic_cmpxchg((volatile __global uint *)source, expected.u32, next.u32);\n"
  "  } while (current.u32 != expected.u32);\n"
  "}\n"
  "\n"
  "void kernel bug_test(volatile global float a[512][128]) {\n"
  "  atomic_add_f(&a[255][get_global_id(0)], 1.0f);\n"
  "  atomic_add_f(&a[256][get_global_id(0)], 1.0f);\n"
  "}\n";

int main() {
  printf("%s", kernel_source);
  cl_int CL_err = CL_SUCCESS;
  cl_device_id device_id;
  CL_err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
  assert(CL_err == CL_SUCCESS);
  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);
  assert(context != NULL);
  cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, NULL);
  assert(commands != NULL);
  cl_program program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, NULL);
  assert(program != NULL);
  CL_err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
  if (!CL_err == CL_SUCCESS) {
    size_t logsize;
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logsize);
    assert(CL_err == CL_SUCCESS);
    char *buffer = (char *)malloc(logsize * sizeof(char));
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logsize, buffer, NULL);
    assert(CL_err == CL_SUCCESS);
    printf("log:\n%s\n", buffer);
    return -1;
  }
  cl_kernel kernel = clCreateKernel(program, "bug_test", &CL_err);
  printf("clCreateKernel, %d\n", CL_err);
  assert(kernel != NULL);
  cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, 512 * 128 * sizeof(cl_float), NULL, NULL);
  assert(mem != NULL);
  cl_float pattern = 0.f;
  CL_err = clEnqueueFillBuffer(commands, mem, &pattern, sizeof(cl_float), 0, 512 * 128 * sizeof(cl_float), 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  CL_err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);
  assert(CL_err == CL_SUCCESS);
  size_t global_work_size[1] = { 128 };
  size_t local_work_size[1] = { 1 };
  CL_err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  cl_float host_buffer[512 * 128];
  CL_err = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, 512 * 128 * sizeof(cl_float), host_buffer, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  for (int i = 255 * 128; i < 257 * 128; ++i) {
    printf("%.0f%c", host_buffer[i], i == 9 ? '\n' : ' ');
  }
  printf("\n");
  return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/cl.h>

const char *kernel_source =
  "void atomic_add_f(volatile global float *source, const float operand) {\n"
  "  if (operand == 0.f)\n"
  "    return;\n"
  "  union {\n"
  "    uint u32;\n"
  "    float f32;\n"
  "  } next, expected, current;\n"
  "  current.f32 = *source;\n"
  "  do {\n"
  "    expected.f32 = current.f32;\n"
  "    next.f32 = expected.f32 + operand;\n"
  "    current.u32 = atomic_cmpxchg((volatile __global uint *)source, expected.u32, next.u32);\n"
  "  } while (current.u32 != expected.u32);\n"
  "}\n"
  "\n"
  "void kernel bug_test(volatile global float a[512][128]) {\n"
  "  atomic_add_f(&a[255][get_global_id(0)], 1.0f);\n"
  "  atomic_add_f(&a[256][get_global_id(0)], 1.0f);\n"
  "}\n";

int main() {
  printf("%s", kernel_source);
  cl_int CL_err = CL_SUCCESS;
  cl_device_id device_id;
  CL_err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
  assert(CL_err == CL_SUCCESS);
  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);
  assert(context != NULL);
  cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, NULL);
  assert(commands != NULL);
  cl_program program = clCreateProgramWithSource(context, 1, &kernel_source, NULL, NULL);
  assert(program != NULL);
  CL_err = clBuildProgram(program, 1, &device_id, "", NULL, NULL);
  if (!CL_err == CL_SUCCESS) {
    size_t logsize;
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logsize);
    assert(CL_err == CL_SUCCESS);
    char *buffer = (char *)malloc(logsize * sizeof(char));
    CL_err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logsize, buffer, NULL);
    assert(CL_err == CL_SUCCESS);
    printf("log:\n%s\n", buffer);
    return -1;
  }
  cl_kernel kernel = clCreateKernel(program, "bug_test", &CL_err);
  printf("clCreateKernel, %d\n", CL_err);
  assert(kernel != NULL);
  cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, 512 * 128 * sizeof(cl_float), NULL, NULL);
  assert(mem != NULL);
  cl_float pattern = 0.f;
  CL_err = clEnqueueFillBuffer(commands, mem, &pattern, sizeof(cl_float), 0, 512 * 128 * sizeof(cl_float), 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  CL_err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);
  assert(CL_err == CL_SUCCESS);
  size_t global_work_size[1] = { 128 };
  size_t local_work_size[1] = { 1 };
  CL_err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  CL_err = clFinish(commands);
  assert(CL_err == CL_SUCCESS);
  cl_float host_buffer[512 * 128];
  CL_err = clEnqueueReadBuffer(commands, mem, CL_TRUE, 0, 512 * 128 * sizeof(cl_float), host_buffer, 0, NULL, NULL);
  assert(CL_err == CL_SUCCESS);
  for (int i = 255 * 128; i < 257 * 128; ++i) {
    printf("%.0f%c", host_buffer[i], i == 9 ? '\n' : ' ');
  }
  printf("\n");
  return 0;
}

Expand

In the kernel function bug_test, the kernel submits consecutive atomic_add_f jobs (random atomic_add_f jobs works fine). The kernel works fine when we perform atomic_add_f on address lower than 256 * 128 * sizeof(float). However, the program either returns CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, or produces wrong results (if CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE is enabled). I think It’s a bug in the ARM OpenCL implementation, which can hardly fixed because ARM OpenCL is proprietary and provides to OEM only.

In my case, this bug can be circumvented by gathering the results in order. As a result, LibreDR supports the ARM Mali platform starting from v0.0.21. However, this may not be possible in all circumstances. In the future, I’m willing to try Rusticl on Panfrost driver for a more open-source implementation.

Publications Computational Photography

EventPSR: Surface Normal and Reflectance Estimation from Photometric Stereo Using an Event Camera

by ybh1998 2025-05-07

Published in: The IEEE/CVF Conference on Computer Vision and Pattern Recognition 2025 (Highlight)

Abstract: Simultaneously acquisition of the surface normal and reflectance parameters is a crucial but challenging technique in the field of computer vision and graphics. It requires capturing multiple high dynamic range (HDR) images in existing methods using frame-based cameras. In this paper, we propose EventPSR, the first work to recover surface normal and reflectance parameters (e.g., metallic and roughness) simultaneously using an event camera. Compared with the existing methods based on photometric stereo or neural radiance fields, EventPSR is a robust and efficient approach that works consistently with different materials. Thanks to the extremely high temporal resolution and high dynamic range coverage of event cameras, EventPSR can recover accurate surface normal and reflectance of objects with various materials in 10 seconds. Extensive experiments on both synthetic data and real objects show that compared with existing methods using more than 100 HDR images, EventPSR recovers comparable surface normal and reflectance parameters with only about 30% of the data rate.

PDF Full Text

Video

Repo (Code & Data)

Publications Computational Photography

Active Hyperspectral Imaging Using an Event Camera

by ybh1998 2025-05-07

Published in: The IEEE/CVF Conference on Computer Vision and Pattern Recognition 2025 (Highlight)

Abstract: Hyperspectral imaging plays a critical role in numerous scientific and industrial fields. Conventional hyperspectral imaging systems often struggle with the trade-off between capture speed, spectral resolution, and bandwidth, particularly in dynamic environments. In this work, we present a novel event-based active hyperspectral imaging system designed for real-time capture with low bandwidth in dynamic scenes. By combining an event camera with a dynamic illumination strategy, our system achieves unprecedented temporal resolution while maintaining high spectral fidelity, all at a fraction of the bandwidth requirements of traditional systems. Unlike basis-based methods that sacrifice spectral resolution for efficiency, our approach enables continuous spectral sampling through an innovative “sweeping rainbow” illumination pattern synchronized with a rotating mirror array. The key insight is leveraging the sparse, asynchronous nature of event cameras to encode spectral variations as temporal contrasts, effectively transforming the spectral reconstruction problem into a series of geometric constraints. Extensive evaluations of both synthetic and real data demonstrate that our system outperforms state-of-the-art methods in temporal resolution while maintaining competitive spectral reconstruction quality.

PDF Full Text

Video

Publications Computational Photography

EventPS: Real-Time Photometric Stereo Using an Event Camera

by ybh1998 2024-05-27

Published in: The IEEE/CVF Conference on Computer Vision and Pattern Recognition 2024 (Oral, Best Paper Runner-Up)

Publications Computational Photography

MILO: Multi-bounce inverse rendering for indoor scene with light-emitting objects

by ybh1998 2023-03-29

Published in: IEEE Transactions on Pattern Analysis and Machine Intelligence (Volume: 45, Issue: 8, August 2023)

Projects Computer Networks

BiscuitMesh

by ybh1998 2021-10-04

BiscuitMesh is an overlay network configuration tool based on babeld, tinc, and v2ray.
Find more on https://codeberg.org/ybh1998/BiscuitMesh

Publications Wireless Sensing

WiFi-Sleep: Sleep Stage Monitoring Using Commodity Wi-Fi Devices

by ybh1998 2021-09-15

Published in: IEEE Internet of Things Journal ( Volume: 8, Issue: 18, Sept.15, 15 2021)