Chapter 2: Architecture

System Layers


┌─────────────────────────────────────────┐
│            Application Layer            │
│     (User Models, Training Scripts)     │
├─────────────────────────────────────────┤
│           Actor System Layer            │
│  (Scheduling, Messaging, Supervision)   │
├─────────────────────────────────────────┤
│          GPU Abstraction Layer          │
│    (CUDA, ROCm, Vulkan, Metal, CPU)     │
├─────────────────────────────────────────┤
│         Memory Management Layer         │
│      (Allocators, Zero-copy, RDMA)      │
├─────────────────────────────────────────┤
│           Distribution Layer            │
│     (Clustering, Gossip, Consensus)     │
├─────────────────────────────────────────┤
│              Runtime Layer              │
│      (OS Threads, I/O, Networking)      │
└─────────────────────────────────────────┘

Core Components

Actor System

pub const ActorSystem = struct {
    // Fixed-size pools (Tiger Style)
    const MAX_ACTORS = 1_000_000;
    const MAX_WORKERS = 1024;

    actors: BoundedArray(Actor, MAX_ACTORS),
    workers: [MAX_WORKERS]Worker,
    scheduler: Scheduler,
    registry: ActorRegistry,

    pub fn init(config: Config) !ActorSystem {
        // Static allocation at startup
        var system: ActorSystem = undefined;
        system.actors = try BoundedArray(Actor, MAX_ACTORS).init();

        // Initialize workers
        const worker_count = @min(config.workers, MAX_WORKERS);
        for (0..worker_count) |i| {
            system.workers[i] = try Worker.init(i);
        }

        // Scheduler and registry initialization is elided in this sketch.
        return system;
    }
};
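
A minimal startup sketch, assuming a Config that carries at least a workers count (the actual Config type is not shown in this chapter):

const std = @import("std");

// Hypothetical Config used only for this example; the real one likely has more fields.
pub const Config = struct {
    workers: u32,
};

pub fn main() !void {
    const cpu_count: u32 = @intCast(try std.Thread.getCpuCount());
    var system = try ActorSystem.init(Config{ .workers = cpu_count });
    _ = &system; // the runtime would now drive system.scheduler
}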

Message Passing

pub const Message = struct {
    // Fixed-size message header
    id: u128,
    from: ActorId,
    to: ActorId,
    timestamp: i64,

    // Payload variants
    payload: union(enum) {
        // Small messages inline (Tiger Style: avoid allocation)
        small: [256]u8,

        // Large messages via zero-copy
        zero_copy: struct {
            ptr: [*]u8,
            len: usize,
            owner: ActorId,
        },

        // GPU memory reference
        gpu_ref: struct {
            device: GpuId,
            ptr: DevicePtr,
            size: usize,
        },

        // RDMA reference
        rdma: struct {
            node: NodeId,
            addr: u64,
            rkey: u32,
        },
    },
};
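
A sketch of building a small inline message; the id scheme and the "ping" payload are placeholders, not the system's actual wire format:

const std = @import("std");

// Illustrative only: fills the inline `small` variant so no allocation occurs.
fn makePing(from: ActorId, to: ActorId) Message {
    var buf = [_]u8{0} ** 256;
    @memcpy(buf[0..4], "ping");
    return .{
        .id = 0, // a real id would come from a monotonic per-actor counter
        .from = from,
        .to = to,
        .timestamp = std.time.milliTimestamp(),
        .payload = .{ .small = buf },
    };
}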

Scheduling

pub const Scheduler = struct {
    // Per-CPU run queues
    run_queues: [MAX_CPUS]RunQueue,

    // Work stealing
    steal_threshold: u32 = 3,
    last_steal: [MAX_CPUS]u64,

    pub fn schedule(self: *Scheduler) !*Actor {
        const cpu = getCpuId();

        // Try local queue first
        if (self.run_queues[cpu].pop()) |actor| {
            return actor;
        }

        // Try work stealing
        if (self.shouldSteal(cpu)) {
            return self.stealWork(cpu);
        }

        // Return idle actor
        return &idle_actor;
    }

    // Reduction-based preemption
    pub fn executeActor(self: *Scheduler, actor: *Actor) !void {
        actor.reductions = INITIAL_REDUCTIONS;

        while (actor.reductions > 0) {
            if (try actor.receiveMessage()) |msg| {
                try actor.handleMessage(msg);
                // Saturating: a message may cost more than the remaining budget.
                actor.reductions -|= messageReductions(msg);
            } else {
                break;
            }
        }

        // Re-enqueue if still has work
        if (actor.hasMessages()) {
            try self.enqueue(actor);
        }
    }
};
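
shouldSteal and stealWork are referenced above but not defined; the sketch below uses a throttled round-robin victim scan (currentTick, RunQueue.steal, and the victim-selection policy are assumptions, not the scheduler's actual heuristics):

// Sketch only: throttled, round-robin work stealing.
fn shouldSteal(self: *Scheduler, cpu: usize) bool {
    // Back off: only retry after at least `steal_threshold` ticks since the last steal.
    return currentTick() - self.last_steal[cpu] >= self.steal_threshold;
}

fn stealWork(self: *Scheduler, cpu: usize) !*Actor {
    var victim = (cpu + 1) % MAX_CPUS;
    while (victim != cpu) : (victim = (victim + 1) % MAX_CPUS) {
        // RunQueue.steal is assumed to pop from the far end of the victim's deque.
        if (self.run_queues[victim].steal()) |actor| {
            self.last_steal[cpu] = currentTick();
            return actor;
        }
    }
    return &idle_actor; // nothing to steal anywhere
}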

Memory Architecture

Actor Memory Layout

pub const ActorMemory = struct {
    // Cache-aligned for performance
    mailbox: BoundedQueue(Message, 256) align(64),
    heap: [ACTOR_HEAP_SIZE]u8 align(4096),
    stack: [ACTOR_STACK_SIZE]u8 align(16),

    // Comptime-known offsets
    comptime {
        assert(@offsetOf(ActorMemory, "mailbox") % 64 == 0);
        assert(@offsetOf(ActorMemory, "heap") % 4096 == 0);
    }
};

GPU Memory Management

pub const GpuMemoryPool = struct {
    // Per-device pools
    pools: [MAX_GPUS]DevicePool,

    pub const DevicePool = struct {
        // Slab allocator for common sizes
        slabs: [SLAB_CLASSES]Slab,

        // Large allocations
        large_allocs: BTreeMap(usize, Allocation),

        pub fn alloc(self: *DevicePool, size: usize) !DevicePtr {
            // Use slab for small allocations
            if (size <= MAX_SLAB_SIZE) {
                const class = sizeToClass(size);
                return self.slabs[class].alloc();
            }

            // Fall back to large allocator
            return self.allocLarge(size);
        }
    };
};
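
sizeToClass is referenced but not defined here; one common choice is power-of-two size classes, sketched below (the 256-byte base class and its relationship to SLAB_CLASSES / MAX_SLAB_SIZE are assumptions):

const std = @import("std");

// Sketch: power-of-two size classes starting at 256 bytes.
const MIN_SLAB_SIZE: usize = 256;

fn sizeToClass(size: usize) usize {
    const clamped = @max(size, MIN_SLAB_SIZE);
    const rounded = std.math.ceilPowerOfTwoAssert(usize, clamped);
    // Class 0 = 256 B, class 1 = 512 B, class 2 = 1 KiB, ...
    return std.math.log2_int(usize, rounded) - std.math.log2_int(usize, MIN_SLAB_SIZE);
}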

Distribution Architecture

Cluster Topology

pub const ClusterNode = struct {
    id: NodeId,
    address: NetworkAddress,
    gpus: []GpuInfo,

    // Failure detection
    last_heartbeat: i64,
    phi_score: f64,  // Phi accrual failure detector

    // Load information
    load: LoadMetrics,

    pub const LoadMetrics = struct {
        cpu_usage: f32,
        memory_usage: f32,
        gpu_usage: [MAX_GPUS_PER_NODE]f32,
        network_bandwidth: f32,
    };
};
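
The phi score follows the phi accrual failure detector; a minimal version under an exponential inter-arrival assumption is sketched below (the real detector likely keeps a windowed history of observed heartbeat intervals):

const std = @import("std");

// Sketch: phi = -log10(P(a heartbeat arrives later than `elapsed`)).
// With exponentially distributed intervals, P = exp(-elapsed / mean),
// so phi reduces to elapsed / (mean * ln 10).
fn phiScore(now_ns: i64, last_heartbeat_ns: i64, mean_interval_ns: f64) f64 {
    const elapsed: f64 = @floatFromInt(now_ns - last_heartbeat_ns);
    return elapsed / (mean_interval_ns * std.math.ln10);
}

// A node is typically declared suspect once phi exceeds a fixed threshold (e.g. 8.0).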

Consensus Layer

pub const ConsensusModule = struct {
    // Raft for critical decisions
    raft: RaftNode,

    // Operations requiring consensus
    pub fn requiresConsensus(op: Operation) bool {
        return switch (op) {
            .node_join, .node_leave => true,
            .supervisor_failover => true,
            .cluster_reconfiguration => true,
            else => false,
        };
    }
};

Fault Tolerance Architecture

Supervision Hierarchy

System Supervisor
├── GPU Supervisor
│   ├── CUDA Supervisor
│   │   └── Kernel Actors
│   └── ROCm Supervisor
│       └── Kernel Actors
├── Model Supervisor
│   ├── Layer Supervisors
│   │   └── Layer Actors
│   └── Optimizer Supervisor
│       └── Optimizer Actors
└── Data Supervisor
    ├── Dataset Actors
    └── Dataloader Actors
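
The top level of this tree could be declared as data; SupervisorSpec, ChildSpec, and the *.start functions below are hypothetical names used only to illustrate the shape:

// Hypothetical declarative spec mirroring the first level of the hierarchy above.
const system_supervisor = SupervisorSpec{
    .name = "system",
    .strategy = .one_for_one,
    .children = &[_]ChildSpec{
        .{ .name = "gpu", .start = GpuSupervisor.start },
        .{ .name = "model", .start = ModelSupervisor.start },
        .{ .name = "data", .start = DataSupervisor.start },
    },
};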

Restart Strategies

pub const RestartStrategy = enum {
    one_for_one,      // Restart only failed actor
    one_for_all,      // Restart all children
    rest_for_one,     // Restart failed and younger siblings
    simple_one_for_one, // Dynamic children
};

pub const RestartPolicy = struct {
    strategy: RestartStrategy,
    max_restarts: u32 = 10,
    time_window: i64 = 60_000_000_000, // 60 seconds, in nanoseconds
    backoff: BackoffStrategy = .exponential,
};
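
A sketch of how a supervisor might enforce max_restarts within time_window (the supervisor type itself is not shown in this chapter; the 64-entry buffer assumes max_restarts stays small):

// Sketch: sliding-window restart intensity check. Exceeding the budget means
// the supervisor stops restarting and escalates to its parent.
const RestartTracker = struct {
    timestamps: [64]i64 = [_]i64{0} ** 64,
    count: usize = 0,

    fn allowRestart(self: *RestartTracker, policy: RestartPolicy, now_ns: i64) bool {
        // Drop restart records that have aged out of the window.
        var kept: usize = 0;
        for (self.timestamps[0..self.count]) |ts| {
            if (now_ns - ts < policy.time_window) {
                self.timestamps[kept] = ts;
                kept += 1;
            }
        }
        self.count = kept;

        if (self.count >= policy.max_restarts) return false; // escalate instead
        self.timestamps[self.count] = now_ns;
        self.count += 1;
        return true;
    }
};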

Performance Optimizations

Comptime Backend Selection

pub fn selectBackend(comptime device: Device) type {
    return switch (device) {
        .cuda => CudaBackend,
        .rocm => RocmBackend,
        .vulkan => VulkanBackend,
        .metal => MetalBackend,
        .cpu => CpuBackend,
    };
}

// Zero-cost abstraction
pub fn matmul(comptime device: Device) MatmulFn {
    const Backend = selectBackend(device);
    return Backend.matmul;
}
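
At the call site the backend is resolved entirely at compile time; a usage sketch (CudaBackend.matmul and MatmulFn come from backend modules not shown here):

// Resolved at comptime: device_matmul is a direct reference to
// CudaBackend.matmul, with no runtime dispatch left behind.
const device_matmul = matmul(.cuda);

// It can then be called like any ordinary function:
//   const c = device_matmul(a, b);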

Cache-Aware Data Structures

pub const CacheAlignedArray = struct {
    pub fn init(comptime T: type, comptime len: usize) type {
        return struct {
            // Ensure cache line alignment
            data: [@divCeil(len * @sizeOf(T), 64) * 64]u8 align(64),

            pub fn get(self: *const @This(), idx: usize) T {
                // Reinterpret the aligned byte storage as an array of T.
                const ptr: [*]const T = @ptrCast(&self.data);
                return ptr[idx];
            }
        };
    }
};
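
A usage sketch; the backing array is padded to whole cache lines, which the comptime asserts verify:

const std = @import("std");

const AlignedF32x1024 = CacheAlignedArray.init(f32, 1024);

comptime {
    // 1024 * 4 bytes fills exactly 64 cache lines, and the storage is 64-byte aligned.
    std.debug.assert(@sizeOf(AlignedF32x1024) % 64 == 0);
    std.debug.assert(@alignOf(AlignedF32x1024) == 64);
}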