Chapter 8: Tiger Style Safety & Performance

Safety First

No Undefined Behavior

// ❌ BAD: Using undefined
pub fn badInit() Actor {
    var actor: Actor = undefined;  // NEVER do this
    actor.id = generateId();
    // What about other fields? Undefined behavior!
    return actor;
}

// ✅ GOOD: Explicit initialization
pub fn goodInit() Actor {
    return Actor{
        .id = generateId(),
        .state = .initializing,
        .mailbox = BoundedQueue.init(),
        .reductions = INITIAL_REDUCTIONS,
        .supervisor = null,
        .children = BoundedArray.init(),
        // Every field explicitly initialized
    };
}

Fixed Limits Everywhere

pub const Limits = struct {
    // Tiger Style: All limits are fixed at compile time
    pub const MAX_ACTORS = 1_000_000;
    pub const MAX_MAILBOX_SIZE = 256;
    pub const MAX_MESSAGE_SIZE = 64 * 1024;  // 64KB
    pub const MAX_CHILDREN = 100;
    pub const MAX_REDUCTIONS = 2000;
    pub const MAX_RETRIES = 10;
    pub const MAX_DEVICES = 16;
    pub const ACTOR_HEAP_SIZE = 1024 * 1024;  // 1MB
    pub const ACTOR_STACK_SIZE = 64 * 1024;   // 64KB
};
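
Relationships between these limits can themselves be checked once, at compile time. The particular relationships below are illustrative assumptions, not rules from the chapter:

comptime {
    // A single message must fit comfortably inside an actor's heap.
    std.debug.assert(Limits.MAX_MESSAGE_SIZE < Limits.ACTOR_HEAP_SIZE);
    // Zero-sized limits would make the bounded structures below unusable.
    std.debug.assert(Limits.MAX_ACTORS > 0 and Limits.MAX_MAILBOX_SIZE > 0);
}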

// All data structures bounded
pub fn BoundedQueue(comptime T: type, comptime max: usize) type {
    return struct {
        const Self = @This();

        // Backing storage is only read through indices guarded by `count`.
        items: [max]T = undefined,
        head: usize = 0,
        tail: usize = 0,
        count: usize = 0,

        pub fn init() Self {
            return .{};
        }

        pub fn push(self: *Self, item: T) !void {
            if (self.count >= max) {
                return error.QueueFull;
            }

            self.items[self.tail] = item;
            self.tail = (self.tail + 1) % max;
            self.count += 1;
        }

        pub fn pop(self: *Self) ?T {
            if (self.count == 0) return null;

            const item = self.items[self.head];
            self.head = (self.head + 1) % max;
            self.count -= 1;

            return item;
        }
    };
}
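
With the generic queue in place, each use site instantiates it with a limit from Limits, so the bound is visible in the type itself (Message is the message type used throughout the book):

// One concrete instantiation per use; capacity is part of the type.
pub const Mailbox = BoundedQueue(Message, Limits.MAX_MAILBOX_SIZE);

test "a full queue rejects further pushes" {
    var queue = BoundedQueue(u8, 2).init();
    try queue.push(1);
    try queue.push(2);
    try std.testing.expectError(error.QueueFull, queue.push(3));
}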

Assertions & Invariants

pub const ActorSystem = struct {
    actors: []Actor,
    registry: ActorRegistry,  // assumed registry type; used by the postcondition below
    worker_count: u32,

    pub fn spawn(self: *ActorSystem, behavior: Behavior) !ActorId {
        // Precondition assertions
        assert(self.worker_count > 0);
        assert(self.actors.len < Limits.MAX_ACTORS);

        const actor = try self.allocateActor();

        // Invariant: actor must be in valid state
        assert(actor.state == .initializing or actor.state == .running);

        // Postcondition: actor is registered
        defer assert(self.registry.contains(actor.id));

        return actor.id;
    }

    // Pair assertions for critical data
    pub fn transferMessage(self: *ActorSystem, msg: Message) !void {
        const sender_before = self.getActor(msg.from).mailbox.count;
        const receiver_before = self.getActor(msg.to).mailbox.count;

        try self.doTransfer(msg);

        const sender_after = self.getActor(msg.from).mailbox.count;
        const receiver_after = self.getActor(msg.to).mailbox.count;

        // Pair assertion: one message moved
        assert(sender_after == sender_before - 1);
        assert(receiver_after == receiver_before + 1);
    }
};

Error Handling

// All errors are explicit
pub const GpuError = error{
    OutOfMemory,
    DeviceLost,
    KernelTimeout,
    InvalidConfiguration,
    UnsupportedOperation,
};

pub fn launchKernel(kernel: Kernel, device: Device) GpuError!void {
    // Check preconditions
    if (!device.isHealthy()) {
        return error.DeviceLost;
    }

    if (kernel.memory_requirement > device.available_memory) {
        return error.OutOfMemory;
    }

    // Fail fast on programmer errors
    assert(kernel.grid.x > 0);  // This should never be 0
    assert(kernel.grid.y > 0);
    assert(kernel.grid.z > 0);

    // Handle all possible failures
    const result = device.backend.launch(kernel) catch |err| {
        switch (err) {
            error.CudaError => return error.DeviceLost,
            error.Timeout => return error.KernelTimeout,
            else => return error.UnsupportedOperation,  // keep the error set closed
        }
    };

    // Verify success
    assert(result.status == .success);
}
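
A caller can combine the explicit error set with the fixed retry limit; launchWithRetry below is a hypothetical helper sketching that pattern, not part of the launch API:

pub fn launchWithRetry(kernel: Kernel, device: Device) GpuError!void {
    var attempt: usize = 0;
    while (attempt < Limits.MAX_RETRIES) : (attempt += 1) {
        launchKernel(kernel, device) catch |err| switch (err) {
            error.KernelTimeout => continue,  // transient: retry within the fixed bound
            else => return err,               // everything else propagates explicitly
        };
        return;  // launch succeeded
    }
    return error.KernelTimeout;  // still timing out after MAX_RETRIES attempts
}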

Performance by Design

Napkin Math

// Document performance assumptions
pub const PerformanceModel = struct {
    // Memory bandwidth: 900 GB/s (A100)
    // L2 cache: 40 MB
    // SM count: 108
    // Registers per SM: 65536

    pub fn estimateMatmulTime(m: usize, n: usize, k: usize) f64 {
        // FLOPs = 2 * M * N * K
        const flops = 2 * m * n * k;

        // Memory transfers = M*K + K*N + M*N elements
        const memory_bytes = @sizeOf(f32) * (m * k + k * n + m * n);

        // Assuming peak performance
        const compute_time = @as(f64, @floatFromInt(flops)) / (19.5 * 1e12);  // 19.5 TFLOPS
        const memory_time = @as(f64, @floatFromInt(memory_bytes)) / (900 * 1e9);  // 900 GB/s

        // Actual time is max of compute and memory bound
        return @max(compute_time, memory_time);
    }

    pub fn canFitInSharedMemory(tensor_size: usize) bool {
        const shared_mem_per_sm = 164 * 1024;  // 164 KB on A100
        return tensor_size <= shared_mem_per_sm;
    }
};
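
As a sanity check of the model, a 4096x4096x4096 f32 matmul on the A100 numbers above works out to roughly 7 ms of compute against about 0.2 ms of memory traffic, so the estimate should land on the compute-bound side:

test "napkin math: 4096^3 f32 matmul is compute bound on A100 numbers" {
    // FLOPs: 2 * 4096^3 ~= 1.37e11  / 19.5 TFLOPS ~= 7.0 ms
    // Bytes: 3 * 4096^2 * 4 ~= 0.2 GB / 900 GB/s  ~= 0.22 ms
    const t = PerformanceModel.estimateMatmulTime(4096, 4096, 4096);
    try std.testing.expect(t > 0.005 and t < 0.010);
}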

Batch Operations

pub const BatchProcessor = struct {
    // Amortize expensive operations
    pub fn processBatch(
        messages: []const Message,
        actor: *Actor,
    ) !void {
        // Copy, then sort messages by type for better branch prediction
        const sorted = try allocator.alloc(Message, messages.len);
        defer allocator.free(sorted);
        @memcpy(sorted, messages);

        std.mem.sort(Message, sorted, {}, struct {
            fn lessThan(ctx: void, a: Message, b: Message) bool {
                _ = ctx;
                return @intFromEnum(a.type) < @intFromEnum(b.type);
            }
        }.lessThan);

        // Process in batches by type
        var i: usize = 0;
        while (i < sorted.len) {
            const msg_type = sorted[i].type;
            var j = i + 1;

            // Find end of same-type messages
            while (j < sorted.len and sorted[j].type == msg_type) : (j += 1) {}

            // Process batch
            try processSameType(sorted[i..j], actor);

            i = j;
        }
    }

    // Batch GPU operations
    pub fn launchKernelBatch(
        kernels: []const Kernel,
        device: Device,
    ) !void {
        // Use CUDA streams for concurrent execution; create only what is needed
        const stream_count = @min(kernels.len, MAX_STREAMS);
        var streams: [MAX_STREAMS]Stream = undefined;  // only [0..stream_count] is ever touched
        for (0..stream_count) |i| {
            streams[i] = try device.createStream();
        }
        defer for (streams[0..stream_count]) |s| device.destroyStream(s);

        // Launch all kernels asynchronously
        for (kernels, 0..) |kernel, i| {
            const stream = streams[i % MAX_STREAMS];
            try device.launchAsync(kernel, stream);
        }

        // Wait for all to complete
        for (streams[0..@min(kernels.len, MAX_STREAMS)]) |stream| {
            try stream.synchronize();
        }
    }
};

Memory Efficiency

pub const MemoryEfficientTensor = struct {
    // Use minimum precision needed
    data: union(enum) {
        f32: []f32,
        f16: []f16,
        bf16: []bf16,
        i8: []i8,   // Quantized
        i4: []u8,   // 4-bit packed
    },

    shape: TensorShape,

    // Lazy allocation
    allocated: bool = false,

    pub fn allocate(self: *@This()) !void {
        if (self.allocated) return;

        const num_elements = self.shape.numElements();

        switch (self.data) {
            .f32 => |*d| d.* = try allocator.alloc(f32, num_elements),
            .f16 => |*d| d.* = try allocator.alloc(f16, num_elements),
            .bf16 => |*d| d.* = try allocator.alloc(bf16, num_elements),
            .i8 => |*d| d.* = try allocator.alloc(i8, num_elements),
            .i4 => |*d| d.* = try allocator.alloc(u8, (num_elements + 1) / 2),
        }

        self.allocated = true;
    }

    // Memory pooling
    pub fn deallocate(self: *@This(), pool: *TensorPool) !void {
        if (!self.allocated) return;

        // Return to pool instead of freeing
        try pool.@"return"(self);  // "return" is a keyword, so the method name must be escaped
        self.allocated = false;
    }
};
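
The 4-bit case is the only one that changes the element-count arithmetic: two values share one byte, so an odd element count still rounds up to a whole byte:

test "4-bit packing rounds odd element counts up to whole bytes" {
    // 7 elements at 4 bits each occupy ceil(7 / 2) = 4 bytes.
    const num_elements: usize = 7;
    try std.testing.expectEqual(@as(usize, 4), (num_elements + 1) / 2);
}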

Cache Optimization

pub const CacheOptimized = struct {
    // Structure padding for cache alignment
    pub const CacheAlignedActor = struct {
        // Hot data on same cache line
        hot: struct {
            id: ActorId,
            state: State,
            reductions: i32,
            message_count: u32,
            padding: [40]u8,  // Pad to 64 bytes
        } align(64),

        // Cold data on separate cache lines
        cold: struct {
            supervisor: ?ActorId,
            children: BoundedArray(ActorId, MAX_CHILDREN),
            metadata: Metadata,
        } align(64),
    };

    // Prefetching
    pub fn processActors(actors: []Actor) !void {
        // Prefetch next actors while processing current
        for (actors, 0..) |*actor, i| {
            // Prefetch next few actors
            if (i + 1 < actors.len) {
                @prefetch(&actors[i + 1], .{
                    .rw = .read,
                    .locality = 1,
                    .cache = .data,
                });
            }

            // Process current actor
            try actor.process();
        }
    }
};
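
The exact padding bytes depend on how ActorId, State, and Metadata are defined elsewhere, but the cache-line alignment itself can be verified in a test rather than taken on trust:

test "cache-aligned actor groups start on cache-line boundaries" {
    // align(64) on the hot and cold groups forces the containing struct to at
    // least cache-line alignment, so hot fields never straddle a line start.
    try std.testing.expect(@alignOf(CacheOptimized.CacheAlignedActor) >= 64);
    try std.testing.expect(@sizeOf(CacheOptimized.CacheAlignedActor) % 64 == 0);
}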

Comptime Optimization

CPU Feature Detection

pub const CpuOptimized = struct {
    // Comptime generation of optimized code paths
    pub fn matmul(comptime features: CpuFeatures) type {
        return struct {
            pub fn compute(a: []f32, b: []f32, c: []f32, m: usize, n: usize, k: usize) void {
                if (comptime features.avx512) {
                    matmulAVX512(a, b, c, m, n, k);
                } else if (comptime features.avx2) {
                    matmulAVX2(a, b, c, m, n, k);
                } else if (comptime features.sse4) {
                    matmulSSE4(a, b, c, m, n, k);
                } else {
                    matmulScalar(a, b, c, m, n, k);
                }
            }
        };
    }

    // Runtime dispatch to comptime-optimized versions
    pub fn createOptimizedMatmul() MatmulFn {
        const cpu = detectCpuFeatures();

        if (cpu.avx512) {
            return matmul(.{ .avx512 = true }).compute;
        } else if (cpu.avx2) {
            return matmul(.{ .avx2 = true }).compute;
        } else if (cpu.sse4) {
            return matmul(.{ .sse4 = true }).compute;
        } else {
            return matmul(.{}).compute;
        }
    }
};
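
A sketch of the call site, assuming MatmulFn is a plain function-pointer type (its definition is not shown in this chapter): feature detection runs once, and every later call goes straight to the specialized path.

pub fn main() !void {
    // Detect CPU features once at startup.
    const matmul_impl = CpuOptimized.createOptimizedMatmul();

    var a = [_]f32{ 1, 2, 3, 4 };
    var b = [_]f32{ 5, 6, 7, 8 };
    var c = [_]f32{ 0, 0, 0, 0 };

    // 2x2 matmul through whichever SIMD path this CPU supports.
    matmul_impl(&a, &b, &c, 2, 2, 2);
}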

GPU Kernel Generation

pub fn generateOptimizedKernel(comptime spec: KernelSpec) type {
    return struct {
        // Generate different versions for different architectures
        pub const cuda_sm70 = if (spec.targets.cuda_sm70)
            generateCudaKernel(spec, .sm_70)
        else
            null;

        pub const cuda_sm80 = if (spec.targets.cuda_sm80)
            generateCudaKernel(spec, .sm_80)
        else
            null;

        pub const cuda_sm90 = if (spec.targets.cuda_sm90)
            generateCudaKernel(spec, .sm_90)
        else
            null;

        // Runtime dispatch
        pub fn launch(device: Device, args: anytype) !void {
            switch (device.compute_capability) {
                90 => try launchKernel(cuda_sm90.?, device, args),
                80...89 => try launchKernel(cuda_sm80.?, device, args),
                70...79 => try launchKernel(cuda_sm70.?, device, args),
                else => return error.UnsupportedDevice,
            }
        }
    };
}
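
Usage then looks identical regardless of which architectures were compiled in; the spec literal below is a placeholder, since the real KernelSpec fields are defined by the kernel DSL:

// Hypothetical spec: compile the softmax kernel for all three targets.
const Softmax = generateOptimizedKernel(.{
    .name = "softmax",
    .targets = .{ .cuda_sm70 = true, .cuda_sm80 = true, .cuda_sm90 = true },
});

pub fn runSoftmax(device: Device, args: anytype) !void {
    // Dispatch picks the best precompiled variant for the attached GPU.
    try Softmax.launch(device, args);
}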

Safety Testing

test "bounded operations never overflow" {
    var queue = BoundedQueue(u32, 10).init();

    // Fill queue
    for (0..10) |i| {
        try queue.push(@intCast(i));
    }

    // Should fail on overflow
    const result = queue.push(10);
    try testing.expectError(error.QueueFull, result);

    // Should handle empty queue
    for (0..10) |_| {
        _ = queue.pop();
    }

    try testing.expect(queue.pop() == null);
}

test "assertions catch invariant violations" {
    if (builtin.mode != .Debug) return;  // Assertions only in debug

    var system = try ActorSystem.init(.{
        .worker_count = 0,  // Invalid!
    });

    // Should panic in debug mode
    const result = std.debug.panic_test(
        system.spawn,
        .{testBehavior},
    );

    try testing.expect(result == .panic);
}

test "memory limits are enforced" {
    var actor = try Actor.init();

    // Try to allocate more than allowed
    const huge_size = Limits.ACTOR_HEAP_SIZE + 1;
    const result = actor.heap.alloc(u8, huge_size);

    try testing.expectError(error.OutOfMemory, result);
}

Performance Validation

test "performance meets requirements" {
    const start = std.time.nanoTimestamp();

    // Message passing latency
    {
        var system = try ActorSystem.init(.{});
        const actor = try system.spawn(echoActor, .{});

        const msg_start = std.time.nanoTimestamp();
        try actor.send(.{ .data = "test" });
        _ = try actor.receive(100);
        const msg_elapsed = std.time.nanoTimestamp() - msg_start;

        // Should be under 1 microsecond
        try testing.expect(msg_elapsed < 1000);
    }

    // GPU kernel launch overhead
    {
        const device = try selectDevice();
        const kernel = simpleKernel;

        const launch_start = std.time.nanoTimestamp();
        try kernel.launch(device, .{});
        const launch_elapsed = std.time.nanoTimestamp() - launch_start;

        // Should be under 10 microseconds
        try testing.expect(launch_elapsed < 10_000);
    }

    const total_elapsed = std.time.nanoTimestamp() - start;
    std.debug.print("Performance test completed in {}ns\n", .{total_elapsed});
}

Tiger Style Checklist

  • No undefined for uninitialized data
  • All loops have fixed bounds
  • All arrays/queues have maximum sizes
  • Functions under 70 lines
  • Assertions for preconditions/invariants
  • Fail fast on programmer errors
  • All errors handled explicitly
  • Compiler warnings as errors
  • No hidden allocations
  • Batch operations where possible
  • Cache-aligned data structures
  • Comptime optimization with runtime dispatch
  • Performance napkin math documented
  • Memory pools instead of allocation/free
  • Prefetching for sequential access