pub const GpuBackend = union(enum) {
    cuda: CudaBackend,
    rocm: RocmBackend,
    vulkan: VulkanBackend,
    metal: MetalBackend,
    opencl: OpenClBackend,
    cpu: CpuBackend,

    // Unified interface
    pub fn allocate(self: *GpuBackend, size: usize) !DevicePtr {
        return switch (self.*) {
            .cuda => |*b| b.cudaMalloc(size),
            .rocm => |*b| b.hipMalloc(size),
            .vulkan => |*b| b.vkAllocateMemory(size),
            .metal => |*b| b.newBuffer(size),
            .opencl => |*b| b.clCreateBuffer(size),
            .cpu => |*b| b.aligned_alloc(size),
        };
    }

    pub fn launch(
        self: *GpuBackend,
        kernel: Kernel,
        grid: Grid,
        args: []const *anyopaque,
    ) !void {
        return switch (self.*) {
            .cuda => |*b| b.cudaLaunchKernel(kernel, grid, args),
            .rocm => |*b| b.hipLaunchKernel(kernel, grid, args),
            .vulkan => |*b| b.vkCmdDispatch(kernel, grid),
            .metal => |*b| b.dispatchThreads(kernel, grid),
            .opencl => |*b| b.clEnqueueNDRangeKernel(kernel, grid),
            .cpu => |*b| b.parallel_for(kernel, grid),
        };
    }
};
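
// Usage sketch: every call site stays backend-agnostic and the tagged union
// dispatches to the vendor-specific driver call at runtime. This assumes
// CpuBackend{} is default-constructible and that GpuBackend also exposes the
// free() counterpart used by the tests further below:
//
//     var backend = GpuBackend{ .cpu = CpuBackend{} };
//     const buf = try backend.allocate(1024 * @sizeOf(f32));
//     defer backend.free(buf);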
pub const DeviceDetector = struct {
    pub fn detectDevices(allocator: std.mem.Allocator) ![]Device {
        var devices = std.ArrayList(Device).init(allocator);
        errdefer devices.deinit();

        // Try CUDA
        if (cuda.isAvailable()) {
            const count = try cuda.deviceGetCount();
            for (0..count) |i| {
                const props = try cuda.getDeviceProperties(i);
                try devices.append(.{
                    .backend = .cuda,
                    .id = i,
                    .name = props.name,
                    .memory = props.totalGlobalMem,
                    .compute_capability = props.major * 10 + props.minor,
                });
            }
        }

        // Try ROCm
        if (rocm.isAvailable()) {
            const count = try rocm.hipGetDeviceCount();
            for (0..count) |i| {
                const props = try rocm.hipGetDeviceProperties(i);
                try devices.append(.{
                    .backend = .rocm,
                    .id = i,
                    .name = props.name,
                    .memory = props.totalGlobalMem,
                    .compute_capability = props.major * 10 + props.minor,
                });
            }
        }

        // Try Vulkan
        if (vulkan.isAvailable()) {
            const physical_devices = try vulkan.enumeratePhysicalDevices();
            for (physical_devices) |pd| {
                const props = try vulkan.getPhysicalDeviceProperties(pd);
                if (props.deviceType == .discrete_gpu or
                    props.deviceType == .integrated_gpu)
                {
                    try devices.append(.{
                        .backend = .vulkan,
                        .id = pd.handle,
                        .name = props.deviceName,
                        .memory = props.memorySize,
                    });
                }
            }
        }

        // Fallback to CPU
        if (devices.items.len == 0) {
            try devices.append(.{
                .backend = .cpu,
                .id = 0,
                .name = "CPU",
                .memory = getSystemMemory(),
            });
        }

        return devices.toOwnedSlice();
    }
};
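
// A possible selection policy on top of detectDevices(): prefer any GPU
// backend over the CPU fallback and break ties by memory size. This helper is
// illustrative only; it is not the selectDevice()/selectBestBackend() used
// further below.
fn pickPreferredDevice(devices: []const Device) Device {
    var best = devices[0]; // detectDevices() always returns at least one entry
    for (devices[1..]) |d| {
        const best_is_cpu = best.backend == .cpu;
        const d_is_cpu = d.backend == .cpu;
        if ((best_is_cpu and !d_is_cpu) or
            (best_is_cpu == d_is_cpu and d.memory > best.memory))
        {
            best = d;
        }
    }
    return best;
}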
pub fn generateKernel(comptime spec: KernelSpec) type {
    return struct {
        // Generate backend-specific code
        pub const cuda_code = if (spec.backends.cuda)
            generateCudaKernel(spec)
        else
            null;

        pub const rocm_code = if (spec.backends.rocm)
            generateRocmKernel(spec)
        else
            null;

        pub const spirv_code = if (spec.backends.vulkan)
            generateSpirvKernel(spec)
        else
            null;

        pub const metal_code = if (spec.backends.metal)
            generateMetalKernel(spec)
        else
            null;

        pub fn launch(
            backend: GpuBackend,
            args: KernelArgs,
        ) !void {
            switch (backend) {
                .cuda => try launchCuda(cuda_code.?, args),
                .rocm => try launchRocm(rocm_code.?, args),
                .vulkan => try launchVulkan(spirv_code.?, args),
                .metal => try launchMetal(metal_code.?, args),
                .cpu => try launchCpu(spec, args),
                else => return error.UnsupportedBackend,
            }
        }
    };
}
// Example: Matrix multiplication kernel
pub const matmul = generateKernel(.{
    .name = "matmul",
    .backends = .{ .cuda = true, .rocm = true, .vulkan = true },
    .params = .{
        .a = .{ .type = f32, .layout = .row_major },
        .b = .{ .type = f32, .layout = .row_major },
        .c = .{ .type = f32, .layout = .row_major },
    },
    .body =
    \\const idx = getGlobalId();
    \\const row = idx / N;
    \\const col = idx % N;
    \\
    \\var sum: f32 = 0.0;
    \\for (0..K) |k| {
    \\    sum += a[row * K + k] * b[k * N + col];
    \\}
    \\c[row * N + col] = sum;
    ,
});
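
// Note: matmul's spec enables cuda, rocm, and vulkan only, so matmul.metal_code
// is null. Launching on a backend that was not requested would unwrap a null
// optional; callers can guard on the generated constants first, e.g.
//
//     if (matmul.spirv_code != null) try matmul.launch(backend, args);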
pub const JitCompiler = struct {
    cuda_compiler: ?*cuda.nvrtcCompiler,
    rocm_compiler: ?*rocm.hiprtcCompiler,
    vulkan_compiler: ?*vulkan.glslangCompiler,

    pub fn compile(
        self: *JitCompiler,
        source: []const u8,
        backend: GpuBackend,
        options: CompileOptions,
    ) !CompiledKernel {
        return switch (backend) {
            .cuda => blk: {
                const ptx = try self.cuda_compiler.?.compile(source, .{
                    .arch = options.arch,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .cuda = ptx };
            },
            .rocm => blk: {
                const hsaco = try self.rocm_compiler.?.compile(source, .{
                    .arch = options.arch,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .rocm = hsaco };
            },
            .vulkan => blk: {
                const spirv = try self.vulkan_compiler.?.compile(source, .{
                    .target = .vulkan1_3,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .vulkan = spirv };
            },
            else => error.UnsupportedBackend,
        };
    }
};
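
// The three optional compilers wrap the vendors' runtime-compilation
// toolchains: NVRTC produces PTX for CUDA, hipRTC produces code objects
// (HSACO) for ROCm, and glslang produces SPIR-V for Vulkan. A field is null
// when the corresponding toolchain is not available, which is why compile()
// only unwraps with `.?` after the backend has been selected.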
pub const UnifiedMemory = struct {
    // Device memory tracking
    allocations: std.AutoHashMap(DevicePtr, Allocation),

    pub const Allocation = struct {
        device: Device,
        size: usize,
        host_ptr: ?*anyopaque,
        is_pinned: bool,
        ref_count: u32,
    };

    // Allocate with migration support
    pub fn allocate(
        self: *UnifiedMemory,
        size: usize,
        hints: AllocationHints,
    ) !DevicePtr {
        const device = selectDevice(hints);
        const ptr = try device.backend.allocate(size);

        // Track allocation
        try self.allocations.put(ptr, .{
            .device = device,
            .size = size,
            .host_ptr = if (hints.cpu_accessible)
                try allocatePinnedHost(size)
            else
                null,
            .is_pinned = hints.pinned,
            .ref_count = 1,
        });

        return ptr;
    }

    // Migrate between devices
    pub fn migrate(
        self: *UnifiedMemory,
        ptr: DevicePtr,
        target: Device,
    ) !DevicePtr {
        const alloc = self.allocations.get(ptr) orelse
            return error.InvalidPointer;

        if (alloc.device.id == target.id) {
            return ptr; // Already on target
        }

        // Allocate on target
        const new_ptr = try target.backend.allocate(alloc.size);

        // Copy data
        try copyBetweenDevices(
            alloc.device,
            ptr,
            target,
            new_ptr,
            alloc.size,
        );

        // Update tracking
        try self.allocations.put(new_ptr, .{
            .device = target,
            .size = alloc.size,
            .host_ptr = alloc.host_ptr,
            .is_pinned = alloc.is_pinned,
            .ref_count = alloc.ref_count,
        });

        // Free old allocation
        try alloc.device.backend.free(ptr);
        _ = self.allocations.remove(ptr);

        return new_ptr;
    }
};
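
// Typical flow (sketch; `mem`, `src_hints`, and `gpu1` are hypothetical
// values, not declarations from this section):
//
//     const ptr = try mem.allocate(bytes, src_hints);
//     // ... later, the scheduler decides the data should live elsewhere ...
//     const moved = try mem.migrate(ptr, gpu1);
//
// Note that migrate() returns a new DevicePtr; the old pointer is freed and
// must not be used afterwards.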
pub const ZeroCopy = struct {
    // RDMA for inter-node transfers
    rdma_context: ?*rdma.Context,
    // GPUDirect for intra-node transfers
    gpu_direct: ?*cuda.GPUDirect,

    pub fn transfer(
        self: *ZeroCopy,
        src: DevicePtr,
        src_device: Device,
        dst: DevicePtr,
        dst_device: Device,
        size: usize,
    ) !void {
        // Same device - use device copy
        if (src_device.id == dst_device.id) {
            return src_device.backend.copyOnDevice(src, dst, size);
        }

        // Same node - try GPUDirect
        if (src_device.node == dst_device.node) {
            if (self.gpu_direct) |gd| {
                if (try gd.canTransfer(src_device, dst_device)) {
                    return gd.p2pCopy(src, dst, size);
                }
            }
        }

        // Different nodes - use RDMA
        if (self.rdma_context) |ctx| {
            const mr_src = try ctx.registerMemory(src, size);
            const mr_dst = try ctx.registerMemory(dst, size);
            defer ctx.deregisterMemory(mr_src);
            defer ctx.deregisterMemory(mr_dst);
            return ctx.write(mr_src, mr_dst, size);
        }

        // Fallback to staged copy through host memory
        const tmp = try std.heap.page_allocator.alloc(u8, size);
        defer std.heap.page_allocator.free(tmp);
        try src_device.backend.copyToHost(src, tmp.ptr, size);
        try dst_device.backend.copyFromHost(tmp.ptr, dst, size);
    }
};
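
// Transfer strategy, fastest first: (1) device-local copy, (2) GPUDirect
// peer-to-peer within a node, (3) RDMA between nodes, (4) a staged copy
// through host memory. The staged fallback above uses a plain pageable
// buffer; reusing a pinned staging buffer (which UnifiedMemory already tracks
// via `is_pinned`) would be a natural optimization.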
pub const GpuKernelActor = struct {
    actor: Actor,
    kernel: CompiledKernel,
    device: Device,
    stream: Stream,

    // Fault tolerance
    retry_count: u32 = 0,
    max_retries: u32 = 3,

    pub fn behavior(self: *Actor, msg: Message) !void {
        const kernel_actor = @fieldParentPtr(GpuKernelActor, "actor", self);

        switch (msg.payload) {
            .launch => |args| {
                // Try to launch the kernel
                const result = kernel_actor.tryLaunch(args);

                if (result) |output| {
                    // Success - send result
                    try msg.from.send(.{
                        .kernel_result = output,
                    });
                } else |err| {
                    // Handle GPU errors
                    if (err == error.GpuError) {
                        if (kernel_actor.retry_count < kernel_actor.max_retries) {
                            // Retry on a different device
                            try kernel_actor.migrateAndRetry(args);
                            kernel_actor.retry_count += 1;
                        } else {
                            // Report failure to supervisor
                            try kernel_actor.actor.supervisor.send(.{
                                .child_failed = .{
                                    .actor = self.id,
                                    .reason = err,
                                },
                            });
                        }
                    }
                }
            },
            .migrate => |target| {
                try kernel_actor.migrateTo(target);
            },
        }
    }

    fn tryLaunch(self: *GpuKernelActor, args: LaunchArgs) !Output {
        // Set device context
        try self.device.backend.setDevice(self.device.id);

        // Launch kernel
        try self.kernel.launch(
            self.device.backend,
            args.grid,
            args.blocks,
            args.shared_memory,
            self.stream,
            args.params,
        );

        // Wait for completion
        try self.stream.synchronize();

        // Check for errors
        if (try self.device.backend.getLastError()) |err| {
            return err;
        }

        return args.output;
    }
};
pub const StreamManager = struct {
    streams: std.AutoHashMap(StreamId, Stream),
    pools: [MAX_DEVICES]StreamPool,

    pub const StreamPool = struct {
        available: BoundedQueue(Stream, MAX_STREAMS_PER_DEVICE),
        in_use: std.AutoHashMap(StreamId, Stream),

        pub fn acquire(self: *StreamPool) !Stream {
            if (self.available.pop()) |stream| {
                try self.in_use.put(stream.id, stream);
                return stream;
            }

            // Create a new stream if under the per-device limit
            if (self.in_use.count() < MAX_STREAMS_PER_DEVICE) {
                const stream = try Stream.create();
                try self.in_use.put(stream.id, stream);
                return stream;
            }

            return error.NoStreamsAvailable;
        }

        pub fn release(self: *StreamPool, stream: Stream) !void {
            _ = self.in_use.remove(stream.id);
            try self.available.push(stream);
        }
    };
};
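
// Usage sketch (hypothetical `manager` and `device_id` values): acquire a
// stream for the duration of a launch and return it to the pool afterwards.
//
//     const stream = try manager.pools[device_id].acquire();
//     defer manager.pools[device_id].release(stream) catch {};
//     // ... enqueue work on `stream` ...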
test "GPU backend detection" {
const devices = try DeviceDetector.detectDevices();
// Should have at least CPU fallback
try testing.expect(devices.len > 0);
for (devices) |device| {
std.debug.print("Found device: {} - {}\n", .{
device.backend,
device.name,
});
}
}
test "kernel compilation and execution" {
const backend = try selectBestBackend();
// Compile simple kernel
const kernel = try JitCompiler.compile(
\\__global__ void add(float* a, float* b, float* c, int n) {
\\ int i = blockIdx.x * blockDim.x + threadIdx.x;
\\ if (i < n) {
\\ c[i] = a[i] + b[i];
\\ }
\\}
, backend, .{});
// Allocate memory
const n = 1024;
const a = try backend.allocate(n * @sizeOf(f32));
const b = try backend.allocate(n * @sizeOf(f32));
const c = try backend.allocate(n * @sizeOf(f32));
defer backend.free(a);
defer backend.free(b);
defer backend.free(c);
// Initialize data
var host_a: [1024]f32 = undefined;
var host_b: [1024]f32 = undefined;
for (0..n) |i| {
host_a[i] = @floatFromInt(i);
host_b[i] = @floatFromInt(i * 2);
}
try backend.copyFromHost(&host_a, a, n * @sizeOf(f32));
try backend.copyFromHost(&host_b, b, n * @sizeOf(f32));
// Launch kernel
try kernel.launch(backend, .{
.grid = .{ .x = (n + 255) / 256 },
.blocks = .{ .x = 256 },
.params = .{ a, b, c, n },
});
// Check results
var host_c: [1024]f32 = undefined;
try backend.copyToHost(c, &host_c, n * @sizeOf(f32));
for (0..n) |i| {
try testing.expectApproxEqRel(
host_a[i] + host_b[i],
host_c[i],
1e-5,
);
}
}
test "GPU fault tolerance" {
var system = try ActorSystem.init(.{});
defer system.deinit();
// Create GPU kernel actor
const kernel_actor = try system.spawn(GpuKernelActor, .{
.kernel = matmul_kernel,
.device = try selectDevice(),
.max_retries = 3,
});
// Inject fault
system.fault_injector.injectGpuError(kernel_actor.device);
// Launch should retry and succeed
const result = try kernel_actor.call(.{
.launch = .{
.grid = .{ .x = 16, .y = 16 },
.params = .{ a, b, c },
},
}, 5000);
try testing.expect(result == .kernel_result);
}