Technical Inspector
See exactly how Sarek translates your high-level OCaml kernels into native GPU code across all major backends.
Vector Addition
OCaml (Sarek)
let%kernel vector_add (a : float32 vector) (b : float32 vector) (c : float32 vector) =
  let idx = get_global_id 0 in
  c.(idx) <- a.(idx) + b.(idx)
// Auto-generated by Sarek CUDA Backend
extern "C" __global__ void vector_add(
    float* a, int sarek_a_len,
    float* b, int sarek_b_len,
    float* c, int sarek_c_len
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < sarek_a_len) {
        c[idx] = a[idx] + b[idx];
    }
}
// Auto-generated by Sarek OpenCL Backend
__kernel void vector_add(
    __global float* a, int sarek_a_len,
    __global float* b, int sarek_b_len,
    __global float* c, int sarek_c_len
) {
    int idx = get_global_id(0);
    if (idx < sarek_a_len) {
        c[idx] = a[idx] + b[idx];
    }
}
// Auto-generated by Sarek Vulkan Backend
#version 450
layout(local_size_x = 256) in;
layout(std430, binding = 0) buffer buf_a { float a[]; };
layout(std430, binding = 1) buffer buf_b { float b[]; };
layout(std430, binding = 2) buffer buf_c { float c[]; };
void main() {
    uint idx = gl_GlobalInvocationID.x;
    c[idx] = a[idx] + b[idx];
}
// Auto-generated by Sarek Metal Backend
#include <metal_stdlib>
using namespace metal;
kernel void vector_add(
    device float* a [[buffer(0)]],
    device float* b [[buffer(1)]],
    device float* c [[buffer(2)]],
    uint idx [[thread_position_in_grid]]
) {
    c[idx] = a[idx] + b[idx];
}
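The inspector only shows the device-side code. For orientation, here is a minimal host-side sketch of how such a kernel could be dispatched from OCaml. Every name in it (Sarek_runtime, Vector.create, Kernel.launch, the ~grid/~block arguments) is hypothetical and only illustrates the shape of a launch; it is not Sarek's actual interface.

(* Hypothetical host-side dispatch of the vector_add kernel above.
   None of these module or function names are taken from Sarek's real API;
   they only illustrate the shape of a launch. *)
let () =
  let n = 1_000_000 in
  let a = Sarek_runtime.Vector.create Float32 n in   (* input vector  *)
  let b = Sarek_runtime.Vector.create Float32 n in   (* input vector  *)
  let c = Sarek_runtime.Vector.create Float32 n in   (* output vector *)
  (* ... fill a and b ... *)
  let block = 256 in
  let grid = (n + block - 1) / block in              (* enough blocks to cover n *)
  Sarek_runtime.Kernel.launch vector_add ~grid ~block (a, b, c)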
Shared Memory Reduction
OCaml (Sarek)
let%kernel reduce (input : float32 vector) (output : float32 vector) =
  let%shared sdata = Array.create Float32 256 in
  let tid = thread_idx_x in
  let%superstep load =
    sdata.(tid) <- input.(get_global_id 0)
  in
  (* Logic simplified for display *)
  if tid < 128 then sdata.(tid) <- sdata.(tid) + sdata.(tid + 128);
  barrier ();
  if tid = 0 then output.(block_idx_x) <- sdata.(0)
// Auto-generated by Sarek CUDA Backend
extern "C" __global__ void reduce(float* input, float* output) {
    __shared__ float sdata[256];
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = input[gid];
    __syncthreads();
    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    __syncthreads();
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
// Auto-generated by Sarek OpenCL Backend
__kernel void reduce(__global float* input, __global float* output) {
    __local float sdata[256];
    int tid = get_local_id(0);
    int gid = get_global_id(0);
    sdata[tid] = input[gid];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid == 0) {
        output[get_group_id(0)] = sdata[0];
    }
}
// Auto-generated by Sarek Vulkan Backend
#version 450
layout(local_size_x = 256) in;
layout(std430, binding = 0) buffer b_in { float input_data[]; };
layout(std430, binding = 1) buffer b_out { float output_data[]; };
shared float sdata[256];
void main() {
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_GlobalInvocationID.x;
    sdata[tid] = input_data[gid];
    barrier();
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    barrier();
    if (tid == 0) output_data[gl_WorkGroupID.x] = sdata[0];
}
// Auto-generated by Sarek Metal Backend
#include <metal_stdlib>
using namespace metal;
kernel void reduce(
    device float* input [[buffer(0)]],
    device float* output [[buffer(1)]],
    threadgroup float* sdata [[threadgroup(0)]],
    uint tid [[thread_index_in_threadgroup]],
    uint gid [[thread_position_in_grid]],
    uint bid [[threadgroup_position_in_grid]]
) {
    sdata[tid] = input[gid];
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (tid == 0) {
        output[bid] = sdata[0];
    }
}
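The OCaml source above notes that the reduction logic is simplified for display: only the first halving step (128 partial sums) is shown. Purely for clarity, here is a plain OCaml sketch, ordinary CPU code rather than Sarek kernel code, of the full tree reduction that one 256-thread workgroup performs; the array stands in for shared memory and the inner loop stands in for the threads active at each stride.

(* Plain OCaml illustration of the full tree reduction for one 256-element
   block; CPU code for clarity, not generated or Sarek kernel code. *)
let reduce_block (block : float array) : float =
  let sdata = Array.copy block in                 (* stands in for shared memory *)
  let stride = ref (Array.length sdata / 2) in
  while !stride > 0 do
    for tid = 0 to !stride - 1 do                 (* the threads active at this stride *)
      sdata.(tid) <- sdata.(tid) +. sdata.(tid + !stride)
    done;
    (* on the GPU, barrier () / __syncthreads() goes here *)
    stride := !stride / 2
  done;
  sdata.(0)

let () =
  let block = Array.init 256 (fun i -> float_of_int i) in
  Printf.printf "block sum = %.1f\n" (reduce_block block)   (* prints 32640.0 *)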
Key Takeaways
- No Overhead: Sarek doesn’t add OCaml runtime logic to your kernels. The generated code matches what you would write by hand for each backend.
- Type Safety: Sarek handles the mapping of OCaml records to backend struct types (see the sketch after this list) and inserts bounds checks, such as idx < sarek_a_len above, where vector lengths are passed to the kernel.
- Unified Logic: You write the math once; Sarek handles the backend-specific boilerplate (__global__, __kernel, get_global_id, etc.).
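The record-to-struct mapping mentioned under Type Safety is not illustrated on this page. As a rough sketch only: the OCaml record below is hypothetical, and the C layout in the comment is a plausible output rather than Sarek's confirmed mapping.

(* Hypothetical example of the record-to-struct mapping; the field names and
   the commented C layout are illustrative, not taken from Sarek's output. *)
type particle = {
  px   : float;   (* declared as float32 fields in a Sarek kernel signature *)
  py   : float;
  mass : float;
}
(* A backend such as CUDA could plausibly lay this out as:
     struct particle { float px; float py; float mass; };
   with a record access like ps.(idx).mass compiling to ps[idx].mass. *)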