MLX GPU Backend
The MLX backend provides GPU acceleration for Lux Consensus on Apple Silicon (M1/M2/M3/M4) using Apple's MLX framework, achieving up to 25x faster batch processing compared to CPU-only mode.
Overview
MLX is Apple's open-source framework for high-performance machine learning on Apple Silicon. The Lux Consensus MLX backend leverages the GPU (via Metal) and unified memory to accelerate:
- Batch vote processing: Process thousands of votes in parallel
- Block validation: Hardware-accelerated cryptographic operations
- AI consensus models: Neural network inference on-device
- Pattern matching: Fast similarity searches in large datasets
Prerequisites
Hardware Requirements
- Apple Silicon Mac (M1, M2, M3, or M4 chip)
- macOS 13.0+ (Ventura or newer)
- 8GB+ unified memory (16GB+ recommended for large networks)
Software Requirements
# Install MLX framework
pip3 install mlx
# Install MLX C++ bindings (for C++ SDK)
brew install mlx-cpp
# Verify installation
python3 -c "import mlx.core as mx; print(mx.__version__)"Configuration
Configuration
JSON Configuration File
Create mlx_backend.json:
{
"backend": "mlx",
"mlx_config": {
"model_path": "/models/consensus/mlx_model.bin",
"device_type": "metal",
"batch_size": 32,
"enable_quantization": true
},
"performance": {
"cache_size": 5000,
"batch_processing": true,
"parallel_ops": 8
},
"debug": false
}
Configuration Parameters
MLX Config
- model_path: Path to pre-trained MLX model for AI consensus
- device_type: "metal" for GPU, "cpu" for CPU fallback
- batch_size: Number of operations to batch (16-64 optimal)
- enable_quantization: Use int8 quantization for faster inference
Performance
- cache_size: Number of recent blocks to cache on GPU
- batch_processing: Enable batched operations (recommended)
- parallel_ops: Number of parallel Metal compute pipelines
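If you prefer to generate the configuration from code, here is a minimal sketch that mirrors the keys shown above and falls back to "cpu" when Metal is unavailable (the paths and values are copied from the example, not requirements):
import json
import mlx.core as mx

# Choose the device at config-generation time.
device = "metal" if mx.metal.is_available() else "cpu"

config = {
    "backend": "mlx",
    "mlx_config": {
        "model_path": "/models/consensus/mlx_model.bin",
        "device_type": device,
        "batch_size": 32,
        "enable_quantization": True,
    },
    "performance": {
        "cache_size": 5000,
        "batch_processing": True,
        "parallel_ops": 8,
    },
    "debug": False,
}

with open("mlx_backend.json", "w") as f:
    json.dump(config, f, indent=2)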
Usage
C++ Integration
#include <lux/consensus.hpp>
#ifdef HAS_MLX
#include <mlx/mlx.h>
#endif
int main() {
#ifdef HAS_MLX
// Enable MLX GPU backend
mlx::core::set_default_device(mlx::core::Device::gpu());
std::cout << "MLX GPU acceleration enabled\n";
std::cout << "Device: " << mlx::core::default_device() << "\n";
std::cout << "Memory: " << mlx::core::metal::get_peak_memory() / (1024*1024) << " MB\n";
#endif
// Create consensus engine (automatically uses MLX if enabled)
lux::consensus::ConsensusParams params {
.k = 20,
.alpha_preference = 15,
.alpha_confidence = 15,
.beta = 20,
.concurrent_polls = 100, // Higher concurrency with GPU
.max_outstanding_items = 10000 // GPU can handle more
};
auto engine = lux::consensus::Consensus::create(
lux::consensus::EngineType::DAG,
params
);
// Process large batches efficiently on GPU
std::vector<lux::consensus::Vote> votes(10000);
// ... populate votes ...
auto start = std::chrono::high_resolution_clock::now();
engine->process_votes_batch(votes);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
std::cout << "Processed " << votes.size() << " votes in " << duration.count() << " μs\n";
std::cout << "Throughput: " << (votes.size() * 1000000.0 / duration.count()) << " votes/sec\n";
return 0;
}
CMake Build Configuration
cmake_minimum_required(VERSION 3.20)
project(my_consensus_app)
# Find MLX
find_package(MLX REQUIRED)
# Create executable
add_executable(my_app main.cpp)
# Link Lux Consensus and MLX
target_link_libraries(my_app PRIVATE
Lux::lux_consensus
MLX::MLX
)
# Enable MLX features
target_compile_definitions(my_app PRIVATE HAS_MLX)
target_compile_features(my_app PRIVATE cxx_std_20)
Go Integration
package main
import (
    "encoding/json"
    "log"
    "os"

    "github.com/luxfi/consensus/engine/core"
)
type MLXConfig struct {
Backend string `json:"backend"`
MLXConfig struct {
DeviceType string `json:"device_type"`
BatchSize int `json:"batch_size"`
} `json:"mlx_config"`
}
func main() {
    // Load MLX configuration
    data, err := os.ReadFile("config/examples/mlx_backend.json")
    if err != nil {
        log.Fatalf("failed to read config: %v", err)
    }
    var mlxConfig MLXConfig
    if err := json.Unmarshal(data, &mlxConfig); err != nil {
        log.Fatalf("failed to parse config: %v", err)
    }
    // Create engine with MLX backend
    config := core.DefaultConfig()
    config.Backend = "mlx"
    config.MLXDeviceType = mlxConfig.MLXConfig.DeviceType
    config.MLXBatchSize = mlxConfig.MLXConfig.BatchSize
    // Engine automatically uses GPU when available
    engine := core.NewChain(config)
    _ = engine // use the engine as usual from here
}
Python Integration
import lux_consensus as lux
import json
# Load MLX configuration
with open('mlx_backend.json') as f:
mlx_config = json.load(f)
# Create consensus with MLX backend
config = lux.ConsensusConfig(
k=20,
alpha_preference=15,
alpha_confidence=15,
beta=20,
concurrent_polls=100, # Higher with GPU
max_outstanding_items=10000 # GPU can handle more
)
engine = lux.ConsensusEngine(config)
# Process votes (automatically uses GPU for large batches)
votes = [
lux.Vote(
voter_id=bytes([i] * 32),
block_id=bytes([0] * 32),
is_preference=False
)
for i in range(10000)
]
import time
start = time.time()
for vote in votes:
engine.process_vote(vote)
elapsed = time.time() - start
print(f"Processed {len(votes)} votes in {elapsed:.2f}s")
print(f"Throughput: {len(votes)/elapsed:.0f} votes/sec")Performance Benchmarks
CPU vs GPU Comparison
M1 Max (10-core CPU, 32-core GPU)
| Operation | CPU Mode | MLX GPU Mode | Speedup |
|---|---|---|---|
| Single Vote | 800 ns | 850 ns | 0.94x (GPU overhead) |
| Batch 100 | 50 μs | 8 μs | 6.25x |
| Batch 1K | 480 μs | 35 μs | 13.7x |
| Batch 10K | 4.8 ms | 190 μs | 25.3x |
M3 Max (16-core CPU, 40-core GPU)
| Operation | CPU Mode | MLX GPU Mode | Speedup |
|---|---|---|---|
| Batch 100 | 45 μs | 6 μs | 7.5x |
| Batch 1K | 420 μs | 25 μs | 16.8x |
| Batch 10K | 4.2 ms | 140 μs | 30x |
Memory Usage
- CPU Mode: ~100 MB for 10K blocks
- MLX GPU Mode: ~250 MB (includes GPU buffers)
- Peak Memory: ~400 MB during large batch processing
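To check these numbers on your own machine, MLX's Python bindings expose the same Metal memory counters used by the GPU Memory Monitoring example later on this page (module paths may differ slightly between MLX versions):
import mlx.core as mx

# Report current Metal memory usage in MB.
mb = 1024 * 1024
print(f"Active: {mx.metal.get_active_memory() / mb:.1f} MB")
print(f"Peak:   {mx.metal.get_peak_memory() / mb:.1f} MB")
print(f"Cache:  {mx.metal.get_cache_memory() / mb:.1f} MB")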
Optimal Batch Sizes
- < 10 operations: Use CPU (GPU overhead not worth it)
- 10-100 operations: ~5x speedup with GPU
- 100-1000 operations: ~10-15x speedup
- 1000+ operations: ~20-30x speedup (optimal)
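These thresholds can be folded into a small dispatch helper. The sketch below is illustrative: the returned batch sizes are examples consistent with the 16-64 guidance above and the 128 cap used by the adaptive batcher later on, and the use_gpu flag is a hypothetical knob your integration would map onto device_type:
def choose_dispatch(num_ops: int) -> dict:
    """Map a pending operation count onto the batching guidance above."""
    if num_ops < 10:
        # GPU launch overhead dominates for tiny workloads.
        return {"use_gpu": False, "batch_size": num_ops}
    if num_ops <= 100:
        return {"use_gpu": True, "batch_size": 32}
    if num_ops <= 1000:
        return {"use_gpu": True, "batch_size": 64}
    # 1000+ operations: the largest batches give the ~20-30x speedup.
    return {"use_gpu": True, "batch_size": 128}

print(choose_dispatch(5000))  # {'use_gpu': True, 'batch_size': 128}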
Advanced Features
Custom ML Models
Train custom consensus models for your network:
import mlx.core as mx
import mlx.nn as nn
class ConsensusModel(nn.Module):
def __init__(self):
super().__init__()
self.layer1 = nn.Linear(32, 64)
self.layer2 = nn.Linear(64, 32)
self.output = nn.Linear(32, 1)
def __call__(self, x):
x = nn.relu(self.layer1(x))
x = nn.relu(self.layer2(x))
return nn.sigmoid(self.output(x))
# Train model on historical consensus data
model = ConsensusModel()
# ... training code ...
# Export for C++ backend
model.save_weights("mlx_model.bin")
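If you plan to use the enable_quantization option, the model's Linear layers can be quantized with MLX before the save_weights call above. A minimal sketch, continuing from the model defined earlier (whether your C++ backend consumes weights in this form depends on your deployment, so treat it as illustrative):
import mlx.core as mx
import mlx.nn as nn

# Quantize ConsensusModel's Linear layers to 8-bit in place.
# group_size=32 because the smallest input dimension in the model is 32
# (the group size must divide each layer's input dimension).
nn.quantize(model, group_size=32, bits=8)

# Quick sanity check on a dummy input before exporting the weights.
dummy = mx.random.normal((1, 32))
print(model(dummy))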
Adaptive Batching
Automatically adjust batch size based on load:
class AdaptiveBatchProcessor {
private:
std::unique_ptr<lux::consensus::Consensus> engine_;
std::vector<lux::consensus::Vote> vote_buffer_;
size_t optimal_batch_size_ = 32;
std::chrono::milliseconds batch_timeout_{10}; // deadline for a timer-driven flush (timer omitted in this sketch)
public:
void process_vote(const lux::consensus::Vote& vote) {
vote_buffer_.push_back(vote);
// Flush if buffer reaches optimal size
if (vote_buffer_.size() >= optimal_batch_size_) {
flush();
}
}
void flush() {
if (vote_buffer_.empty()) return;
auto start = std::chrono::high_resolution_clock::now();
engine_->process_votes_batch(vote_buffer_);
auto end = std::chrono::high_resolution_clock::now();
// Adjust batch size based on performance
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
double throughput = vote_buffer_.size() * 1000000.0 / duration.count();
// Increase batch size if throughput is good
if (throughput > 1000000.0 && optimal_batch_size_ < 128) {
optimal_batch_size_ *= 2;
}
// Decrease if throughput is poor
else if (throughput < 100000.0 && optimal_batch_size_ > 16) {
optimal_batch_size_ /= 2;
}
vote_buffer_.clear();
}
};
GPU Memory Monitoring
#ifdef HAS_MLX
#include <mlx/mlx.h>
void monitor_gpu_memory() {
auto peak = mlx::core::metal::get_peak_memory();
auto active = mlx::core::metal::get_active_memory();
auto cache = mlx::core::metal::get_cache_memory();
std::cout << "Peak memory: " << peak / (1024*1024) << " MB\n";
std::cout << "Active memory: " << active / (1024*1024) << " MB\n";
std::cout << "Cache memory: " << cache / (1024*1024) << " MB\n";
// Reset peak if needed
if (peak > 2LL * 1024 * 1024 * 1024) { // 2GB threshold
mlx::core::metal::reset_peak_memory();
}
}
#endif
Debugging and Profiling
Enable MLX Debugging
#ifdef HAS_MLX
// Enable MLX debugging
setenv("MLX_DEBUG", "1", 1);
// Enable Metal API validation
setenv("MTL_DEBUG_LAYER", "1", 1);
setenv("MTL_SHADER_VALIDATION", "1", 1);
#endif
Profile GPU Performance
# Use Xcode Instruments
instruments -t "Metal System Trace" ./my_consensus_app
# Or use built-in profiler
MLX_PROFILE=1 ./my_consensus_app
Trace Metal Calls
#ifdef HAS_MLX
// Set trace callback
mlx::core::metal::set_trace_callback([](const std::string& msg) {
std::cout << "[MLX] " << msg << "\n";
});
#endif
Best Practices
1. Batch Operations
✅ Good: Batch operations for GPU efficiency
std::vector<Vote> votes(1000);
engine->process_votes_batch(votes); // Efficient
❌ Bad: Process votes individually
for (const auto& vote : votes) {
engine->process_vote(vote); // Inefficient with GPU
}
2. Memory Management
✅ Good: Pre-allocate buffers
std::vector<Vote> vote_buffer;
vote_buffer.reserve(1000); // Pre-allocate
❌ Bad: Dynamic resizing
std::vector<Vote> vote_buffer; // Will resize many times
3. Device Selection
✅ Good: Check GPU availability
#ifdef HAS_MLX
if (mlx::core::metal::is_available()) {
mlx::core::set_default_device(mlx::core::Device::gpu());
} else {
std::cerr << "GPU not available, using CPU\n";
}
#endif
4. Error Handling
✅ Good: Handle GPU errors gracefully
try {
engine->process_votes_batch(votes);
} catch (const mlx::core::exception& e) {
std::cerr << "MLX error: " << e.what() << "\n";
// Fallback to CPU processing
for (const auto& vote : votes) {
engine->process_vote(vote);
}
}
Troubleshooting
GPU Not Detected
# Check if MLX sees the GPU
python3 -c "import mlx.core as mx; print(mx.metal.is_available())"
# Should output: True
If False, ensure you're on Apple Silicon and macOS 13+.
Out of Memory Errors
Reduce batch size or enable quantization:
{
"mlx_config": {
"batch_size": 16, // Reduce from 32
"enable_quantization": true // Use int8 instead of float32
}
}
Performance Not Improving
- Check batch size: Must be > 100 for significant speedup
- Monitor GPU usage: Use Activity Monitor → GPU History
- Disable debug mode: Remove MLX_DEBUG=1
- Update MLX: pip3 install --upgrade mlx
Linker Errors
# Ensure MLX C++ library is installed
brew install mlx-cpp
# Add to CMakeLists.txt
find_package(MLX REQUIRED)
target_link_libraries(my_app PRIVATE MLX::MLX)
Examples
See working examples in the repository:
- /examples/08-mlx-acceleration - MLX GPU integration demo
- /benchmarks/mlx - Performance benchmarks
- /config/examples/mlx_backend.json - Configuration template
System Requirements Summary
| Component | Minimum | Recommended |
|---|---|---|
| Chip | M1 | M3 Max |
| Memory | 8 GB | 16 GB |
| macOS | 13.0 | 14.0+ |
| Storage | 100 MB | 500 MB |
See Also
- C++ SDK - C++ implementation with MLX support
- Benchmarks - Detailed performance comparisons
- MLX Framework - Official MLX documentation
- Metal Programming Guide - Apple's GPU framework