
Chapter 4: GPU Abstraction

Multi-Backend Architecture

Backend Interface

pub const GpuBackend = union(enum) {
    cuda: CudaBackend,
    rocm: RocmBackend,
    vulkan: VulkanBackend,
    metal: MetalBackend,
    opencl: OpenClBackend,
    cpu: CpuBackend,

    // Unified interface
    pub fn allocate(self: *GpuBackend, size: usize) !DevicePtr {
        return switch (self.*) {
            .cuda => |*b| b.cudaMalloc(size),
            .rocm => |*b| b.hipMalloc(size),
            .vulkan => |*b| b.vkAllocateMemory(size),
            .metal => |*b| b.newBuffer(size),
            .opencl => |*b| b.clCreateBuffer(size),
            .cpu => |*b| b.aligned_alloc(size),
        };
    }

    pub fn launch(
        self: *GpuBackend,
        kernel: Kernel,
        grid: Grid,
        args: []const *anyopaque,
    ) !void {
        return switch (self.*) {
            .cuda => |*b| b.cudaLaunchKernel(kernel, grid, args),
            .rocm => |*b| b.hipLaunchKernel(kernel, grid, args),
            .vulkan => |*b| b.vkCmdDispatch(kernel, grid),
            .metal => |*b| b.dispatchThreads(kernel, grid),
            .opencl => |*b| b.clEnqueueNDRangeKernel(kernel, grid),
            .cpu => |*b| b.parallel_for(kernel, grid),
        };
    }
};
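
Callers hold a single GpuBackend value and never branch on a vendor API themselves; the switch inside each method is the only place backend-specific calls appear. A minimal usage sketch, assuming a CpuBackend with an all-default initializer, a Grid with x/y/z fields, and a my_kernel value already in scope (none of these are defined above):

// Sketch only: CpuBackend{}, the Grid literal, and my_kernel are assumptions.
var backend = GpuBackend{ .cpu = CpuBackend{} };

// One code path, regardless of which vendor runtime backs the union.
const buf = try backend.allocate(1 << 20);
_ = buf; // cleanup elided; the chapter's free() path is shown later

try backend.launch(my_kernel, .{ .x = 256, .y = 1, .z = 1 }, &.{});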

Runtime Device Detection

pub const DeviceDetector = struct {
    pub fn detectDevices(allocator: std.mem.Allocator) ![]Device {
        var devices = ArrayList(Device).init(allocator);

        // Try CUDA
        if (cuda.isAvailable()) {
            const count = try cuda.deviceGetCount();
            for (0..count) |i| {
                const props = try cuda.getDeviceProperties(i);
                try devices.append(.{
                    .backend = .cuda,
                    .id = i,
                    .name = props.name,
                    .memory = props.totalGlobalMem,
                    .compute_capability = props.major * 10 + props.minor,
                });
            }
        }

        // Try ROCm
        if (rocm.isAvailable()) {
            const count = try rocm.hipGetDeviceCount();
            for (0..count) |i| {
                const props = try rocm.hipGetDeviceProperties(i);
                try devices.append(.{
                    .backend = .rocm,
                    .id = i,
                    .name = props.name,
                    .memory = props.totalGlobalMem,
                    .compute_capability = props.major * 10 + props.minor,
                });
            }
        }

        // Try Vulkan
        if (vulkan.isAvailable()) {
            const physical_devices = try vulkan.enumeratePhysicalDevices();
            for (physical_devices) |pd| {
                const props = try vulkan.getPhysicalDeviceProperties(pd);
                if (props.deviceType == .discrete_gpu or
                    props.deviceType == .integrated_gpu) {
                    try devices.append(.{
                        .backend = .vulkan,
                        .id = pd.handle,
                        .name = props.deviceName,
                        .memory = props.memorySize,
                    });
                }
            }
        }

        // Fallback to CPU
        if (devices.items.len == 0) {
            try devices.append(.{
                .backend = .cpu,
                .id = 0,
                .name = "CPU",
                .memory = getSystemMemory(),
            });
        }

        return devices.toOwnedSlice();
    }
};
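
The probe order doubles as a preference ranking: CUDA and ROCm devices are appended before Vulkan ones, and the CPU entry only appears when nothing else was found. A short selection sketch under that assumption, where allocator is any std.mem.Allocator:

const devices = try DeviceDetector.detectDevices(allocator);
defer allocator.free(devices);

// The CPU fallback guarantees at least one entry, so indexing is safe.
const best = devices[0];
std.debug.print("Selected {s} device: {s}\n", .{ @tagName(best.backend), best.name });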

Kernel Compilation

Comptime Kernel Generation

pub fn generateKernel(comptime spec: KernelSpec) type {
    return struct {
        // Generate backend-specific code
        pub const cuda_code = if (spec.backends.cuda)
            generateCudaKernel(spec)
        else
            null;

        pub const rocm_code = if (spec.backends.rocm)
            generateRocmKernel(spec)
        else
            null;

        pub const spirv_code = if (spec.backends.vulkan)
            generateSpirvKernel(spec)
        else
            null;

        pub const metal_code = if (spec.backends.metal)
            generateMetalKernel(spec)
        else
            null;

        pub fn launch(
            backend: GpuBackend,
            args: KernelArgs,
        ) !void {
            switch (backend) {
                // The guards are comptime-known, so arms for backends that were
                // not requested in the spec are never analyzed or emitted.
                .cuda => if (comptime spec.backends.cuda) try launchCuda(cuda_code, args) else return error.UnsupportedBackend,
                .rocm => if (comptime spec.backends.rocm) try launchRocm(rocm_code, args) else return error.UnsupportedBackend,
                .vulkan => if (comptime spec.backends.vulkan) try launchVulkan(spirv_code, args) else return error.UnsupportedBackend,
                .metal => if (comptime spec.backends.metal) try launchMetal(metal_code, args) else return error.UnsupportedBackend,
                .cpu => try launchCpu(spec, args),
                else => return error.UnsupportedBackend,
            }
        }
    };
}

// Example: Matrix multiplication kernel
pub const matmul = generateKernel(.{
    .name = "matmul",
    .backends = .{ .cuda = true, .rocm = true, .vulkan = true },
    .params = .{
        .a = .{ .type = f32, .layout = .row_major },
        .b = .{ .type = f32, .layout = .row_major },
        .c = .{ .type = f32, .layout = .row_major },
    },
    .body =
        \\const idx = getGlobalId();
        \\const row = idx / N;
        \\const col = idx % N;
        \\
        \\var sum: f32 = 0.0;
        \\for (0..K) |k| {
        \\    sum += a[row * K + k] * b[k * N + col];
        \\}
        \\c[row * N + col] = sum;
    ,
});
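
Because the spec is comptime-known, the struct returned by generateKernel carries each requested backend's code as a constant, and launching is a single call that picks the right blob for the runtime backend. A hedged usage sketch; the KernelArgs field names here are illustrative, not defined above:

// a_buf, b_buf, c_buf are DevicePtr values obtained from backend.allocate().
try matmul.launch(backend, .{
    .a = a_buf,
    .b = b_buf,
    .c = c_buf,
});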

JIT Compilation

pub const JitCompiler = struct {
    cuda_compiler: ?*cuda.nvrtcCompiler,
    rocm_compiler: ?*rocm.hiprtcCompiler,
    vulkan_compiler: ?*vulkan.glslangCompiler,

    pub fn compile(
        self: *JitCompiler,
        source: []const u8,
        backend: GpuBackend,
        options: CompileOptions,
    ) !CompiledKernel {
        return switch (backend) {
            .cuda => blk: {
                const ptx = try self.cuda_compiler.?.compile(source, .{
                    .arch = options.arch,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .cuda = ptx };
            },
            .rocm => blk: {
                const hsaco = try self.rocm_compiler.?.compile(source, .{
                    .arch = options.arch,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .rocm = hsaco };
            },
            .vulkan => blk: {
                const spirv = try self.vulkan_compiler.?.compile(source, .{
                    .target = .vulkan1_3,
                    .opt_level = options.optimization,
                });
                break :blk CompiledKernel{ .vulkan = spirv };
            },
            else => error.UnsupportedBackend,
        };
    }
};
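
JIT compilation covers the cases comptime generation cannot, such as kernel source that only arrives at runtime. A sketch of a call, assuming an already-initialized compiler; the arch string and optimization level are placeholders, and the CompileOptions fields mirror the ones read above:

// jit: *JitCompiler with the CUDA compiler available on this node.
const compiled = try jit.compile(cuda_source, .{ .cuda = cuda_backend }, .{
    .arch = "sm_90",
    .optimization = .O3,
});
// compiled.cuda now holds the PTX blob, ready for a cudaLaunchKernel path.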

Memory Management

Unified Memory Abstraction

pub const UnifiedMemory = struct {
    // Device memory tracking
    allocations: std.AutoHashMap(DevicePtr, Allocation),

    pub const Allocation = struct {
        device: Device,
        size: usize,
        host_ptr: ?*anyopaque,
        is_pinned: bool,
        ref_count: u32,
    };

    // Allocate with migration support
    pub fn allocate(
        self: *UnifiedMemory,
        size: usize,
        hints: AllocationHints,
    ) !DevicePtr {
        const device = selectDevice(hints);

        const ptr = try device.backend.allocate(size);

        // Track allocation
        try self.allocations.put(ptr, .{
            .device = device,
            .size = size,
            .host_ptr = if (hints.cpu_accessible)
                try allocatePinnedHost(size)
            else
                null,
            .is_pinned = hints.pinned,
            .ref_count = 1,
        });

        return ptr;
    }

    // Migrate between devices
    pub fn migrate(
        self: *UnifiedMemory,
        ptr: DevicePtr,
        target: Device,
    ) !DevicePtr {
        const alloc = self.allocations.get(ptr) orelse
            return error.InvalidPointer;

        if (alloc.device.id == target.id) {
            return ptr; // Already on target
        }

        // Allocate on target
        const new_ptr = try target.backend.allocate(alloc.size);

        // Copy data
        try copyBetweenDevices(
            alloc.device,
            ptr,
            target,
            new_ptr,
            alloc.size,
        );

        // Update tracking
        try self.allocations.put(new_ptr, .{
            .device = target,
            .size = alloc.size,
            .host_ptr = alloc.host_ptr,
            .is_pinned = alloc.is_pinned,
            .ref_count = alloc.ref_count,
        });

        // Free old allocation
        try alloc.device.backend.free(ptr);
        _ = self.allocations.remove(ptr);

        return new_ptr;
    }
};
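
A sketch of the intended call pattern: allocate once with hints, then migrate the buffer to whichever device the next kernel runs on. The AllocationHints fields mirror the ones read in allocate; allocator, DevicePtr, and target_gpu are assumed to be in scope:

var mem = UnifiedMemory{
    .allocations = std.AutoHashMap(DevicePtr, UnifiedMemory.Allocation).init(allocator),
};

// Host-visible, non-pinned allocation placed by selectDevice().
const ptr = try mem.allocate(4 * 1024 * 1024, .{
    .cpu_accessible = true,
    .pinned = false,
});

// Move the buffer next to the GPU that will consume it; the old copy is freed.
const local_ptr = try mem.migrate(ptr, target_gpu);
_ = local_ptr;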

Zero-Copy Optimizations

pub const ZeroCopy = struct {
    // RDMA for inter-node
    rdma_context: ?*rdma.Context,

    // GPUDirect for intra-node
    gpu_direct: ?*cuda.GPUDirect,

    pub fn transfer(
        self: *ZeroCopy,
        src: DevicePtr,
        src_device: Device,
        dst: DevicePtr,
        dst_device: Device,
        size: usize,
    ) !void {
        // Same device - use device copy
        if (src_device.id == dst_device.id) {
            return src_device.backend.copyOnDevice(src, dst, size);
        }

        // Same node - try GPUDirect
        if (src_device.node == dst_device.node) {
            if (self.gpu_direct) |gd| {
                if (try gd.canTransfer(src_device, dst_device)) {
                    return gd.p2pCopy(src, dst, size);
                }
            }
        }

        // Different nodes - use RDMA
        if (self.rdma_context) |ctx| {
            const mr_src = try ctx.registerMemory(src, size);
            const mr_dst = try ctx.registerMemory(dst, size);
            defer ctx.deregisterMemory(mr_src);
            defer ctx.deregisterMemory(mr_dst);

            return ctx.write(mr_src, mr_dst, size);
        }

        // Fallback to staged copy through host
        const tmp = try std.heap.page_allocator.alloc(u8, size);
        defer std.heap.page_allocator.free(tmp);

        try src_device.backend.copyToHost(src, tmp.ptr, size);
        try dst_device.backend.copyFromHost(tmp.ptr, dst, size);
    }
};

Actor-Based GPU Execution

GPU Kernel Actor

pub const GpuKernelActor = struct {
    actor: Actor,
    kernel: CompiledKernel,
    device: Device,
    stream: Stream,

    // Fault tolerance
    retry_count: u32 = 0,
    max_retries: u32 = 3,

    pub fn behavior(self: *Actor, msg: Message) !void {
        const kernel_actor = @fieldParentPtr(GpuKernelActor, "actor", self);

        switch (msg.payload) {
            .launch => |args| {
                // Try to launch kernel
                const result = kernel_actor.tryLaunch(args);

                if (result) |output| {
                    // Success - send result
                    try msg.from.send(.{
                        .kernel_result = output,
                    });
                } else |err| {
                    // Handle GPU errors
                    if (err == error.GpuError) {
                        if (kernel_actor.retry_count < kernel_actor.max_retries) {
                            // Retry on different device
                            try kernel_actor.migrateAndRetry(args);
                            kernel_actor.retry_count += 1;
                        } else {
                            // Report failure to supervisor
                            try kernel_actor.actor.supervisor.send(.{
                                .child_failed = .{
                                    .actor = self.id,
                                    .reason = err,
                                },
                            });
                        }
                    }
                }
            },
            .migrate => |target| {
                try kernel_actor.migrateTo(target);
            },
        }
    }

    fn tryLaunch(self: *GpuKernelActor, args: LaunchArgs) !Output {
        // Set device context
        try self.device.backend.setDevice(self.device.id);

        // Launch kernel
        try self.kernel.launch(
            self.device.backend,
            args.grid,
            args.blocks,
            args.shared_memory,
            self.stream,
            args.params,
        );

        // Wait for completion
        try self.stream.synchronize();

        // Check for errors
        if (try self.device.backend.getLastError()) |err| {
            return err;
        }

        return args.output;
    }
};

Stream Management

pub const StreamManager = struct {
    streams: std.AutoHashMap(StreamId, Stream),
    pools: [MAX_DEVICES]StreamPool,

    pub const StreamPool = struct {
        available: BoundedQueue(Stream, MAX_STREAMS_PER_DEVICE),
        in_use: std.AutoHashMap(StreamId, Stream),

        pub fn acquire(self: *StreamPool) !Stream {
            if (self.available.pop()) |stream| {
                try self.in_use.put(stream.id, stream);
                return stream;
            }

            // Create new stream if under limit
            if (self.in_use.count() < MAX_STREAMS_PER_DEVICE) {
                const stream = try Stream.create();
                try self.in_use.put(stream.id, stream);
                return stream;
            }

            return error.NoStreamsAvailable;
        }

        pub fn release(self: *StreamPool, stream: Stream) !void {
            _ = self.in_use.remove(stream.id);
            try self.available.push(stream);
        }
    };
};
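
Usage sketch: a launch path borrows a stream from its device's pool and returns it once the work is queued, so no device ever holds more than MAX_STREAMS_PER_DEVICE live streams. The manager, device_index, compiled_kernel, and launch arguments are assumed to be in scope; the launch signature matches GpuKernelActor.tryLaunch above:

// device_index maps a Device to its slot in manager.pools (assumption).
var pool = &manager.pools[device_index];

const stream = try pool.acquire();
defer pool.release(stream) catch {};

try compiled_kernel.launch(device.backend, grid, blocks, shared_memory, stream, params);
try stream.synchronize();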

Testing GPU Code

test "GPU backend detection" {
    const devices = try DeviceDetector.detectDevices(testing.allocator);
    defer testing.allocator.free(devices);

    // Should have at least CPU fallback
    try testing.expect(devices.len > 0);

    for (devices) |device| {
        std.debug.print("Found device: {} - {}\n", .{
            device.backend,
            device.name,
        });
    }
}

test "kernel compilation and execution" {
    const backend = try selectBestBackend();

    // Compile simple kernel
    const kernel = try JitCompiler.compile(
        \\__global__ void add(float* a, float* b, float* c, int n) {
        \\    int i = blockIdx.x * blockDim.x + threadIdx.x;
        \\    if (i < n) {
        \\        c[i] = a[i] + b[i];
        \\    }
        \\}
    , backend, .{});

    // Allocate memory
    const n = 1024;
    const a = try backend.allocate(n * @sizeOf(f32));
    const b = try backend.allocate(n * @sizeOf(f32));
    const c = try backend.allocate(n * @sizeOf(f32));
    defer backend.free(a);
    defer backend.free(b);
    defer backend.free(c);

    // Initialize data
    var host_a: [1024]f32 = undefined;
    var host_b: [1024]f32 = undefined;
    for (0..n) |i| {
        host_a[i] = @floatFromInt(i);
        host_b[i] = @floatFromInt(i * 2);
    }

    try backend.copyFromHost(&host_a, a, n * @sizeOf(f32));
    try backend.copyFromHost(&host_b, b, n * @sizeOf(f32));

    // Launch kernel
    try kernel.launch(backend, .{
        .grid = .{ .x = (n + 255) / 256 },
        .blocks = .{ .x = 256 },
        .params = .{ a, b, c, n },
    });

    // Check results
    var host_c: [1024]f32 = undefined;
    try backend.copyToHost(c, &host_c, n * @sizeOf(f32));

    for (0..n) |i| {
        try testing.expectApproxEqRel(
            host_a[i] + host_b[i],
            host_c[i],
            1e-5,
        );
    }
}

test "GPU fault tolerance" {
    var system = try ActorSystem.init(.{});
    defer system.deinit();

    // Create GPU kernel actor
    const kernel_actor = try system.spawn(GpuKernelActor, .{
        .kernel = matmul_kernel,
        .device = try selectDevice(),
        .max_retries = 3,
    });

    // Inject fault
    system.fault_injector.injectGpuError(kernel_actor.device);

    // Launch should retry and succeed
    const result = try kernel_actor.call(.{
        .launch = .{
            .grid = .{ .x = 16, .y = 16 },
            .params = .{ a, b, c },
        },
    }, 5000);

    try testing.expect(result == .kernel_result);
}