Technical Inspector

See exactly how Sarek translates your high-level OCaml kernels into native GPU code across all major backends.

Vector Addition

OCaml (Sarek)
CUDA
OpenCL
Vulkan (GLSL)
Metal (MSL)
let%kernel vector_add (a : float32 vector) (b : float32 vector) (c : float32 vector) =
  let idx = get_global_id 0 in
  c.(idx) <- a.(idx) + b.(idx)
// Auto-generated by Sarek CUDA Backend
extern "C" __global__ void vector_add(
    float* a, int sarek_a_len,
    float* b, int sarek_b_len,
    float* c, int sarek_c_len
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < sarek_a_len) {
        c[idx] = a[idx] + b[idx];
    }
}
// Auto-generated by Sarek OpenCL Backend
__kernel void vector_add(
    __global float* a, int sarek_a_len,
    __global float* b, int sarek_b_len,
    __global float* c, int sarek_c_len
) {
    int idx = get_global_id(0);
    if (idx < sarek_a_len) {
        c[idx] = a[idx] + b[idx];
    }
}
// Auto-generated by Sarek Vulkan Backend
#version 450
layout(local_size_x = 256) in;
layout(std430, binding = 0) buffer buf_a { float a[]; };
layout(std430, binding = 1) buffer buf_b { float b[]; };
layout(std430, binding = 2) buffer buf_c { float c[]; };

void main() {
    uint idx = gl_GlobalInvocationID.x;
    c[idx] = a[idx] + b[idx];
}
// Auto-generated by Sarek Metal Backend
#include <metal_stdlib>
using namespace metal;

kernel void vector_add(
    device float* a [[buffer(0)]],
    device float* b [[buffer(1)]],
    device float* c [[buffer(2)]],
    uint idx [[thread_position_in_grid]]
) {
    c[idx] = a[idx] + b[idx];
}

Shared Memory Reduction

OCaml (Sarek)
CUDA
OpenCL
Vulkan (GLSL)
Metal (MSL)
let%kernel reduce (input : float32 vector) (output : float32 vector) =
  let%shared sdata = Array.create Float32 256 in
  let tid = thread_idx_x in
  
  let%superstep load =
    sdata.(tid) <- input.(get_global_id 0)
  in
  
  (* Logic simplified for display *)
  if tid < 128 then sdata.(tid) <- sdata.(tid) + sdata.(tid + 128);
  barrier ();
  
  if tid = 0 then output.(block_idx_x) <- sdata.(0)
extern "C" __global__ void reduce(float* input, float* output) {
    __shared__ float sdata[256];
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = input[gid];
    __syncthreads();

    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    __syncthreads();

    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
__kernel void reduce(__global float* input, __global float* output) {
    __local float sdata[256];
    int tid = get_local_id(0);
    int gid = get_global_id(0);

    sdata[tid] = input[gid];
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    if (tid == 0) {
        output[get_group_id(0)] = sdata[0];
    }
}
#version 450
layout(local_size_x = 256) in;
layout(std430, binding = 0) buffer b_in { float input_data[]; };
layout(std430, binding = 1) buffer b_out { float output_data[]; };
shared float sdata[256];

void main() {
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_GlobalInvocationID.x;
    sdata[tid] = input_data[gid];
    barrier();
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    barrier();
    if (tid == 0) output_data[gl_WorkGroupID.x] = sdata[0];
}
kernel void reduce(
    device float* input [[buffer(0)]],
    device float* output [[buffer(1)]],
    threadgroup float* sdata [[threadgroup(0)]],
    uint tid [[thread_index_in_threadgroup]],
    uint gid [[thread_position_in_grid]],
    uint bid [[threadgroup_position_in_grid]]
) {
    sdata[tid] = input[gid];
    threadgroup_barrier(mem_flags::mem_threadgroup);

    if (tid < 128) {
        sdata[tid] = sdata[tid] + sdata[tid + 128];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    if (tid == 0) {
        output[bid] = sdata[0];
    }
}

Key Takeaways