GPU Compute API Reference¶
Maturity: Partial (Experimental) — The GPU Compute module (`jaguar_gpu`) provides a hardware-abstraction layer supporting CUDA, OpenCL, Metal, and Vulkan backends. The CPU backend is functional and tested. GPU backends (CUDA, Metal, OpenCL) build correctly when enabled, but they are not wired into the engine tick loop and are not tested in CI. GPU acceleration of the physics pipeline is a roadmap item.
Overview¶
The GPU module provides:
- Backend Abstraction: Single API for multiple GPU backends (CPU always available; GPU backends off by default)
- Kernel Management: Launch and manage compute kernels
- Memory Management: Efficient buffer allocation and transfer (memory-pool deadlocks fixed in v0.7.0)
- Device Queries: Capability detection and device selection
Compute Backend¶
Header¶
#include <jaguar/gpu/compute_backend.h>
Backend Types¶
namespace jaguar::gpu {

/// Compute backends the module can target.
enum class BackendType {
    CPU,     ///< Fallback CPU implementation
    CUDA,    ///< NVIDIA CUDA
    OpenCL,  ///< OpenCL (cross-platform)
    Metal,   ///< Apple Metal
    Vulkan   ///< Vulkan Compute
};

/// Kind of device reported through DeviceCapabilities.
enum class DeviceType {
    Unknown,
    CPU,
    GPU,
    Accelerator,
    Custom
};

/// Where a buffer lives and who can reach it.
enum class MemoryType {
    DeviceLocal,  ///< GPU memory only
    HostVisible,  ///< GPU memory the CPU can access
    HostCached,   ///< Cached host memory
    Shared        ///< Unified memory (CPU + GPU)
};

/// Access pattern requested for a memory operation.
enum class MemoryAccess {
    ReadOnly,
    WriteOnly,
    ReadWrite
};

/// Category of a kernel argument.
enum class ArgType {
    Buffer,
    Scalar,
    LocalMemory,
    Image,
    Sampler
};

}  // namespace jaguar::gpu
Handle Types¶
namespace jaguar::gpu {

// All four handles are plain wrappers around a 64-bit id. An id of 0 is the
// "invalid" value, which is also what a default-constructed handle holds.
// Every handle offers the same operations: is_valid(), == and !=.

/// Buffer handle (opaque reference to GPU memory).
struct BufferHandle {
    uint64_t id = 0;
    bool is_valid() const { return id != 0; }
    bool operator==(const BufferHandle& other) const { return id == other.id; }
    bool operator!=(const BufferHandle& other) const { return id != other.id; }
};

/// Stream/queue handle for async operations.
struct StreamHandle {
    uint64_t id = 0;
    bool is_valid() const { return id != 0; }
    bool operator==(const StreamHandle& other) const { return id == other.id; }
    bool operator!=(const StreamHandle& other) const { return id != other.id; }
};

/// Event handle for synchronization.
struct EventHandle {
    uint64_t id = 0;
    bool is_valid() const { return id != 0; }
    bool operator==(const EventHandle& other) const { return id == other.id; }
    bool operator!=(const EventHandle& other) const { return id != other.id; }
};

/// Kernel handle.
struct KernelHandle {
    uint64_t id = 0;
    bool is_valid() const { return id != 0; }
    bool operator==(const KernelHandle& other) const { return id == other.id; }
    bool operator!=(const KernelHandle& other) const { return id != other.id; }
};

}  // namespace jaguar::gpu
Result Types¶
namespace jaguar::gpu {
/// Outcome of a backend operation.
///
/// A default-constructed result reports failure (`success == false`) with an
/// empty message and error code 0, so a result that was never filled in cannot
/// be mistaken for a successful one.
struct BackendResult {
    bool success = false;       // true when the operation completed
    std::string error_message;  // description of the failure; empty by default
    int error_code = 0;         // backend-specific code; 0 by default
};
/// Device capabilities.
///
/// Every member has a default (empty string, DeviceType::Unknown, zero or
/// false), so a default-constructed instance is fully initialized and safe to
/// read before a backend has filled it in.
struct DeviceCapabilities {
    std::string device_name;
    std::string vendor;
    std::string driver_version;
    DeviceType device_type = DeviceType::Unknown;

    // Compute capabilities
    uint32_t compute_units = 0;
    uint32_t max_work_group_size = 0;
    std::array<uint32_t, 3> max_work_item_sizes = {0, 0, 0};
    uint32_t max_work_item_dimensions = 0;
    uint32_t warp_size = 0;  // CUDA warp / AMD wavefront

    // Memory capabilities
    uint64_t global_memory_size = 0;
    uint64_t local_memory_size = 0;  // Shared memory per block
    uint64_t constant_memory_size = 0;
    uint64_t max_buffer_size = 0;
    uint32_t memory_bus_width = 0;
    uint64_t memory_bandwidth = 0;  // bytes/sec

    // Features
    bool supports_double_precision = false;
    bool supports_half_precision = false;
    bool supports_atomics = false;
    bool supports_images = false;
    bool supports_unified_memory = false;

    // Clock speeds (MHz)
    uint32_t clock_frequency = 0;
    uint32_t memory_clock_frequency = 0;
};
/// Launch configuration: work sizes, dynamic shared memory and the stream a
/// kernel launch is issued on. All sizes default to a single work item.
struct LaunchConfig {
    std::array<uint32_t, 3> global_size{1, 1, 1};  // Total work items
    std::array<uint32_t, 3> local_size{1, 1, 1};   // Work group size
    uint32_t shared_memory_size{0};                // Dynamic shared memory
    StreamHandle stream;                           // Execution stream

    // Helper constructors.

    /// 1-D configuration for `total_items` items with groups of `group_size`.
    static LaunchConfig linear(uint32_t total_items, uint32_t group_size = 256);

    /// 2-D configuration for a `width` x `height` domain with
    /// `block_x` x `block_y` groups.
    static LaunchConfig grid_2d(uint32_t width, uint32_t height,
                                uint32_t block_x = 16, uint32_t block_y = 16);

    /// 3-D configuration for an `x` x `y` x `z` domain with
    /// `block_x` x `block_y` x `block_z` groups.
    static LaunchConfig grid_3d(uint32_t x, uint32_t y, uint32_t z,
                                uint32_t block_x = 8, uint32_t block_y = 8,
                                uint32_t block_z = 8);
};
/// Kernel argument.
///
/// Members are default-initialized so that a default-constructed KernelArg
/// never carries an indeterminate pointer or size.
///
/// Lifetime: scalar arguments are held as a raw pointer (`scalar_ptr`) plus a
/// size; the struct has no storage for the value itself. The value handed to
/// from_scalar() / from_value() therefore presumably has to stay alive until
/// the launch that consumes the argument has been issued. Pass a named
/// variable, not a temporary. NOTE(review): confirm against the implementation
/// of from_value().
struct KernelArg {
    ArgType type = ArgType::Buffer;
    BufferHandle buffer;               // For buffer arguments
    const void* scalar_ptr = nullptr;  // For scalar arguments
    size_t scalar_size = 0;
    size_t local_memory_size = 0;      // For local memory arguments

    // Factory methods
    static KernelArg from_buffer(BufferHandle buffer);
    static KernelArg from_scalar(const void* ptr, size_t size);
    template<typename T>
    static KernelArg from_value(const T& value);
    static KernelArg local_memory(size_t size);
};
} // namespace jaguar::gpu
IComputeBackend Interface¶
namespace jaguar::gpu {
// Abstract interface implemented by each compute backend (see BackendType).
// Every member except the destructor is pure virtual.
//
// Error reporting: operations that can fail return a BackendResult. The
// create_* / load_* members return a handle instead; handles expose
// is_valid(), which is false for id == 0.
// NOTE(review): whether a failed create_*/load_* call really yields an invalid
// handle (rather than throwing) is not visible here — confirm per backend.
//
// NOTE(review): several virtual members declare default arguments. Default
// arguments are taken from the static type used at the call site, so
// overriders should repeat exactly the same defaults.
class IComputeBackend {
public:
virtual ~IComputeBackend() = default;
// Lifecycle
virtual BackendResult initialize() = 0;
virtual void shutdown() = 0;
virtual BackendType get_type() const = 0;
// Device management
// Devices are addressed by index; get_device_capabilities() defaults to
// device 0.
virtual size_t get_device_count() const = 0;
virtual DeviceCapabilities get_device_capabilities(size_t device_index = 0) const = 0;
virtual BackendResult select_device(size_t device_index) = 0;
virtual size_t get_selected_device() const = 0;
// Buffer management
// `size` is in bytes (the usage examples pass N * sizeof(float)). The second
// overload additionally takes initial contents for the new buffer.
virtual BufferHandle create_buffer(size_t size, MemoryType type = MemoryType::DeviceLocal) = 0;
virtual BufferHandle create_buffer(size_t size, const void* initial_data,
MemoryType type = MemoryType::DeviceLocal) = 0;
virtual void destroy_buffer(BufferHandle buffer) = 0;
virtual size_t get_buffer_size(BufferHandle buffer) const = 0;
// Memory transfers
// Each host<->device copy comes in a plain form and an *_async form that
// takes the stream to issue the copy on.
virtual BackendResult copy_to_device(BufferHandle dst, const void* src, size_t size) = 0;
virtual BackendResult copy_to_device_async(BufferHandle dst, const void* src, size_t size,
StreamHandle stream) = 0;
virtual BackendResult copy_from_device(void* dst, BufferHandle src, size_t size) = 0;
virtual BackendResult copy_from_device_async(void* dst, BufferHandle src, size_t size,
StreamHandle stream) = 0;
virtual BackendResult copy_device_to_device(BufferHandle dst, BufferHandle src, size_t size) = 0;
// Memory mapping (for HostVisible buffers)
// NOTE(review): map_buffer() returns a raw pointer; presumably nullptr on
// failure — confirm before relying on it.
virtual void* map_buffer(BufferHandle buffer, MemoryAccess access) = 0;
virtual void unmap_buffer(BufferHandle buffer) = 0;
// Kernel management
// load_kernel() takes kernel source text plus the entry-point name and
// optional compile options; load_kernel_from_binary() takes a prebuilt binary.
virtual KernelHandle load_kernel(const std::string& source,
const std::string& kernel_name,
const std::string& compile_options = "") = 0;
virtual KernelHandle load_kernel_from_binary(const std::vector<uint8_t>& binary,
const std::string& kernel_name) = 0;
virtual void destroy_kernel(KernelHandle kernel) = 0;
// Kernel execution
// `args` are supplied as a vector of KernelArg alongside the launch
// configuration.
virtual BackendResult launch_kernel(KernelHandle kernel,
const LaunchConfig& config,
const std::vector<KernelArg>& args) = 0;
// Stream management
virtual StreamHandle create_stream() = 0;
virtual void destroy_stream(StreamHandle stream) = 0;
virtual BackendResult synchronize_stream(StreamHandle stream) = 0;
virtual BackendResult synchronize_device() = 0;
// Event management
// Events are recorded on, or waited for by, a specific stream.
virtual EventHandle create_event() = 0;
virtual void destroy_event(EventHandle event) = 0;
virtual BackendResult record_event(EventHandle event, StreamHandle stream) = 0;
virtual BackendResult wait_for_event(EventHandle event, StreamHandle stream) = 0;
virtual bool is_event_complete(EventHandle event) = 0;
virtual float get_elapsed_time(EventHandle start, EventHandle end) = 0; // milliseconds
// Utilities
// NOTE(review): units are not stated here; presumably bytes — confirm.
virtual size_t get_available_memory() const = 0;
virtual size_t get_total_memory() const = 0;
};
} // namespace jaguar::gpu
IKernel Interface¶
namespace jaguar::gpu {

/// Higher-level kernel wrapper: arguments are bound by index on the object and
/// the kernel is then launched with only a LaunchConfig.
class IKernel {
public:
    virtual ~IKernel() = default;

    /// Kernel name.
    virtual std::string get_name() const = 0;

    // --- Argument information ---
    virtual size_t get_num_arguments() const = 0;
    virtual ArgType get_argument_type(size_t index) const = 0;

    // --- Argument binding ---

    /// Binds a buffer to argument slot `index`.
    virtual void set_argument(size_t index, BufferHandle buffer) = 0;

    /// Binds `size` bytes starting at `data` to argument slot `index`.
    virtual void set_argument(size_t index, const void* data, size_t size) = 0;

    /// Convenience overload: forwards the address and size of `value` to the
    /// (index, data, size) overload above.
    template<typename T>
    void set_argument(size_t index, const T& value) {
        const void* const raw_bytes = &value;
        set_argument(index, raw_bytes, sizeof(value));
    }

    /// Declares argument slot `index` as local memory of `size`.
    virtual void set_local_memory_argument(size_t index, size_t size) = 0;

    // --- Execution ---
    virtual BackendResult launch(const LaunchConfig& config) = 0;

    // --- Queries ---

    /// Preferred work group size.
    virtual size_t get_preferred_work_group_size() const = 0;

    /// Local memory usage.
    virtual size_t get_local_memory_usage() const = 0;
};

}  // namespace jaguar::gpu
Backend Factory¶
namespace jaguar::gpu {

/// Static entry points for discovering and constructing compute backends.
class BackendFactory {
public:
    // --- Discovery ---

    /// Reports whether the given backend is available.
    static bool is_backend_available(BackendType type);

    /// Lists the available backends.
    static std::vector<BackendType> get_available_backends();

    // --- Construction ---

    /// Creates the requested backend.
    static std::unique_ptr<IComputeBackend> create(BackendType type);

    /// Creates the best available backend (prioritizes GPU over CPU).
    static std::unique_ptr<IComputeBackend> create_best_available();
};

}  // namespace jaguar::gpu
Usage Examples¶
Basic Buffer Operations¶
#include <iostream>
#include <vector>

#include <jaguar/gpu/compute_backend.h>

using namespace jaguar::gpu;

// Create best available backend
auto backend = BackendFactory::create_best_available();

// initialize() reports failure through BackendResult; nothing below is
// meaningful on a backend that failed to come up, so the rest of the example
// only runs on success.
const BackendResult init = backend->initialize();
if (!init.success) {
    std::cerr << "Backend initialization failed: " << init.error_message << "\n";
} else {
    // Print device info
    const auto caps = backend->get_device_capabilities();
    std::cout << "Device: " << caps.device_name << "\n";
    std::cout << "Memory: " << caps.global_memory_size / (1024*1024) << " MB\n";
    std::cout << "Compute units: " << caps.compute_units << "\n";

    // Create buffers: the input buffer is created with initial contents, the
    // output buffer is only sized.
    const size_t N = 1024 * 1024;
    std::vector<float> host_input(N, 1.0f);
    std::vector<float> host_output(N);
    auto input_buffer = backend->create_buffer(N * sizeof(float), host_input.data());
    auto output_buffer = backend->create_buffer(N * sizeof(float));

    // ... compute ...

    // Read results; host_output is only valid if the copy succeeded.
    const BackendResult readback =
        backend->copy_from_device(host_output.data(), output_buffer, N * sizeof(float));
    if (!readback.success) {
        std::cerr << "Readback failed: " << readback.error_message << "\n";
    }

    // Cleanup
    backend->destroy_buffer(input_buffer);
    backend->destroy_buffer(output_buffer);
    backend->shutdown();
}
Kernel Compilation and Launch¶
#include <jaguar/gpu/compute_backend.h>
using namespace jaguar::gpu;
// Request the CUDA backend explicitly.
// NOTE(review): this page's overview states that GPU backends are off by
// default, so the CUDA backend may be unavailable;
// BackendFactory::is_backend_available(BackendType::CUDA) can be queried
// first. The pointer returned by create() is dereferenced unchecked here.
auto backend = BackendFactory::create(BackendType::CUDA);
// NOTE(review): the BackendResult returned by initialize() is ignored.
backend->initialize();
// Kernel source (CUDA)
// Element-wise c[i] = a[i] + b[i] for i < n, one element per thread.
const char* kernel_source = R"(
extern "C" __global__ void vector_add(
const float* a,
const float* b,
float* c,
int n
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
)";
// Compile kernel ("-O3" is passed through as the compile options)
// NOTE(review): the returned handle is not checked with is_valid().
auto kernel = backend->load_kernel(kernel_source, "vector_add", "-O3");
// Create buffers
// a and b are uploaded as initial contents; c is allocated without contents.
const int N = 1000000;
std::vector<float> a(N, 1.0f), b(N, 2.0f), c(N);
auto buf_a = backend->create_buffer(N * sizeof(float), a.data());
auto buf_b = backend->create_buffer(N * sizeof(float), b.data());
auto buf_c = backend->create_buffer(N * sizeof(float));
// Set kernel arguments
// Order matches the kernel signature (a, b, c, n). N is a named variable
// that stays alive past the launch, which matters because KernelArg holds
// scalars through a pointer (scalar_ptr).
std::vector<KernelArg> args = {
KernelArg::from_buffer(buf_a),
KernelArg::from_buffer(buf_b),
KernelArg::from_buffer(buf_c),
KernelArg::from_value(N)
};
// Launch configuration: N total items, work groups of 256
auto config = LaunchConfig::linear(N, 256);
// Launch kernel
auto result = backend->launch_kernel(kernel, config, args);
if (!result.success) {
std::cerr << "Kernel launch failed: " << result.error_message << "\n";
}
// Synchronize and read results
// NOTE(review): execution continues to the readback even when the launch
// failed above; both results below are ignored.
backend->synchronize_device();
backend->copy_from_device(c.data(), buf_c, N * sizeof(float));
// Verify (prints the first 10 elements only)
for (int i = 0; i < 10; ++i) {
std::cout << "c[" << i << "] = " << c[i] << "\n"; // Should be 3.0
}
// Cleanup
backend->destroy_buffer(buf_a);
backend->destroy_buffer(buf_b);
backend->destroy_buffer(buf_c);
backend->destroy_kernel(kernel);
backend->shutdown();
Async Operations with Streams¶
#include <jaguar/gpu/compute_backend.h>
using namespace jaguar::gpu;
// NOTE(review): BackendResult return values are ignored throughout this
// example for brevity.
auto backend = BackendFactory::create_best_available();
backend->initialize();
// Create multiple streams for concurrent operations
auto stream1 = backend->create_stream();
auto stream2 = backend->create_stream();
// Create events for timing
auto start_event = backend->create_event();
auto end_event = backend->create_event();
const size_t N = 10000000;
// Host sources for the two uploads; they must stay alive until the async
// copies issued below have been synchronized.
std::vector<float> host_a(N), host_b(N);
auto buf_a = backend->create_buffer(N * sizeof(float), MemoryType::DeviceLocal);
auto buf_b = backend->create_buffer(N * sizeof(float), MemoryType::DeviceLocal);
// Record start (on stream1)
backend->record_event(start_event, stream1);
// Async copy on stream1
backend->copy_to_device_async(buf_a, host_a.data(), N * sizeof(float), stream1);
// Async copy on stream2 (runs concurrently)
backend->copy_to_device_async(buf_b, host_b.data(), N * sizeof(float), stream2);
// Launch kernels (after respective copies complete)
// ... kernel launches ...
// Record end (also on stream1)
// NOTE(review): both timing events are recorded on stream1, so the elapsed
// time printed below presumably brackets only the work issued on stream1 and
// does not wait for stream2 — confirm against the backend's event semantics.
backend->record_event(end_event, stream1);
// Synchronize both streams before reading the timing
backend->synchronize_stream(stream1);
backend->synchronize_stream(stream2);
// Get elapsed time (get_elapsed_time reports milliseconds)
float elapsed_ms = backend->get_elapsed_time(start_event, end_event);
std::cout << "Elapsed time: " << elapsed_ms << " ms\n";
// Cleanup
backend->destroy_event(start_event);
backend->destroy_event(end_event);
backend->destroy_stream(stream1);
backend->destroy_stream(stream2);
backend->destroy_buffer(buf_a);
backend->destroy_buffer(buf_b);
backend->shutdown();
Cross-Platform Kernel¶
#include <jaguar/gpu/compute_backend.h>
using namespace jaguar::gpu;
// OpenCL kernel (portable)
// Per entity: velocity += (force / mass) * dt, then position += velocity * dt
// using the velocity that was just updated. Work items with
// gid >= num_entities return immediately.
// The float4 arithmetic here applies to all four components, including w.
// NOTE(review): mass is used as a divisor without a zero check.
const char* opencl_kernel = R"(
__kernel void physics_update(
__global float4* positions,
__global float4* velocities,
__global const float4* forces,
__global const float* masses,
float dt,
int num_entities
) {
int gid = get_global_id(0);
if (gid >= num_entities) return;
float mass = masses[gid];
float4 acceleration = forces[gid] / mass;
velocities[gid] += acceleration * dt;
positions[gid] += velocities[gid] * dt;
}
)";
// CUDA kernel (NVIDIA-optimized)
// Same update written per component. Only x, y and z are touched:
// acceleration.w is set to 0.0f and the w components of velocities and
// positions are left unchanged, which differs from the OpenCL variant above
// whenever forces[gid].w or velocities[gid].w is non-zero.
// NOTE(review): mass is used as a divisor without a zero check.
const char* cuda_kernel = R"(
extern "C" __global__ void physics_update(
float4* positions,
float4* velocities,
const float4* forces,
const float* masses,
float dt,
int num_entities
) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
if (gid >= num_entities) return;
float mass = masses[gid];
float4 acceleration = make_float4(
forces[gid].x / mass,
forces[gid].y / mass,
forces[gid].z / mass,
0.0f
);
velocities[gid].x += acceleration.x * dt;
velocities[gid].y += acceleration.y * dt;
velocities[gid].z += acceleration.z * dt;
positions[gid].x += velocities[gid].x * dt;
positions[gid].y += velocities[gid].y * dt;
positions[gid].z += velocities[gid].z * dt;
}
)";
// Select appropriate kernel based on backend
auto backend = BackendFactory::create_best_available();
backend->initialize();
const char* kernel_source = nullptr;
switch (backend->get_type()) {
case BackendType::CUDA:
kernel_source = cuda_kernel;
break;
case BackendType::OpenCL:
case BackendType::CPU:
default:
kernel_source = opencl_kernel;
break;
}
auto kernel = backend->load_kernel(kernel_source, "physics_update");
// Use kernel for physics simulation
// ...
Memory-Mapped I/O¶
#include <iostream>

#include <jaguar/gpu/compute_backend.h>

using namespace jaguar::gpu;

auto backend = BackendFactory::create_best_available();
backend->initialize();

// Create host-visible buffer
const size_t N = 1024;
auto buffer = backend->create_buffer(
    N * sizeof(float),
    MemoryType::HostVisible
);

// Map buffer for CPU access. map_buffer() returns a raw pointer, so it is
// checked before being dereferenced.
// NOTE(review): nullptr is assumed to be the failure value — confirm.
float* mapped = static_cast<float*>(
    backend->map_buffer(buffer, MemoryAccess::WriteOnly)
);
if (mapped != nullptr) {
    // Write directly to GPU memory
    for (size_t i = 0; i < N; ++i) {
        mapped[i] = static_cast<float>(i);
    }
    // Unmap before GPU use
    backend->unmap_buffer(buffer);
} else {
    std::cerr << "map_buffer failed (write access)\n";
}

// ... use buffer in GPU kernel ...

// Map again to read results
mapped = static_cast<float*>(
    backend->map_buffer(buffer, MemoryAccess::ReadOnly)
);
if (mapped != nullptr) {
    for (size_t i = 0; i < 10; ++i) {
        std::cout << "buffer[" << i << "] = " << mapped[i] << "\n";
    }
    backend->unmap_buffer(buffer);
} else {
    std::cerr << "map_buffer failed (read access)\n";
}

backend->destroy_buffer(buffer);
backend->shutdown();
Physics Simulation on GPU¶
#include <cstddef>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <jaguar/gpu/compute_backend.h>
#include <jaguar/jaguar.h>

using namespace jaguar;
using namespace jaguar::gpu;

/// Example engine that keeps entity positions, velocities, forces and masses
/// in device buffers and advances them with two kernels per step
/// ("compute_forces" followed by "integrate").
///
/// Positions, velocities and forces use four floats per entity (x, y, z and a
/// padding component written as 0); masses use one float per entity.
///
/// Failures reported by the backend are turned into std::runtime_error;
/// size mismatches throw std::length_error / std::invalid_argument.
///
/// NOTE(review): `integration_kernel_source` and `force_kernel_source` are not
/// defined in this example and are assumed to be provided elsewhere.
class GPUPhysicsEngine {
    std::unique_ptr<IComputeBackend> backend_;
    KernelHandle integration_kernel_;
    KernelHandle force_kernel_;
    BufferHandle positions_;
    BufferHandle velocities_;
    BufferHandle forces_;
    BufferHandle masses_;
    size_t capacity_ = 0;      // max_entities the device buffers were sized for
    size_t num_entities_ = 0;  // entities currently resident on the device

    /// Throws std::runtime_error carrying the backend's message when `result`
    /// reports failure.
    static void check_ok(const BackendResult& result, const char* what) {
        if (!result.success) {
            throw std::runtime_error(std::string(what) + ": " + result.error_message);
        }
    }

public:
    /// Creates the backend, allocates device buffers for `max_entities`
    /// entities and loads both kernels.
    /// @throws std::runtime_error if the backend, a buffer or a kernel could
    ///         not be set up.
    void initialize(size_t max_entities) {
        backend_ = BackendFactory::create_best_available();
        if (!backend_) {
            throw std::runtime_error("no compute backend available");
        }
        check_ok(backend_->initialize(), "backend initialization failed");

        capacity_ = max_entities;
        num_entities_ = 0;

        // Allocate buffers for max entities
        const size_t vec4_size = max_entities * 4 * sizeof(float);
        const size_t scalar_size = max_entities * sizeof(float);
        positions_ = backend_->create_buffer(vec4_size);
        velocities_ = backend_->create_buffer(vec4_size);
        forces_ = backend_->create_buffer(vec4_size);
        masses_ = backend_->create_buffer(scalar_size);
        if (!positions_.is_valid() || !velocities_.is_valid() ||
            !forces_.is_valid() || !masses_.is_valid()) {
            throw std::runtime_error("device buffer allocation failed");
        }

        // Load kernels
        integration_kernel_ = backend_->load_kernel(
            integration_kernel_source, "integrate");
        force_kernel_ = backend_->load_kernel(
            force_kernel_source, "compute_forces");
        if (!integration_kernel_.is_valid() || !force_kernel_.is_valid()) {
            throw std::runtime_error("kernel loading failed");
        }
    }

    /// Uploads position, velocity and mass of every entity in `states`.
    /// @throws std::length_error if `states` holds more entities than the
    ///         buffers were sized for in initialize(); copying them would
    ///         write past the end of the device buffers.
    void update_entities(const std::vector<physics::EntityState>& states) {
        if (states.size() > capacity_) {
            throw std::length_error("entity count exceeds GPU buffer capacity");
        }
        num_entities_ = states.size();
        if (num_entities_ == 0) {
            return;  // nothing to upload
        }

        // Pack data
        std::vector<float> pos_data(num_entities_ * 4);
        std::vector<float> vel_data(num_entities_ * 4);
        std::vector<float> mass_data(num_entities_);
        for (size_t i = 0; i < num_entities_; ++i) {
            pos_data[i*4 + 0] = states[i].position.x;
            pos_data[i*4 + 1] = states[i].position.y;
            pos_data[i*4 + 2] = states[i].position.z;
            pos_data[i*4 + 3] = 0.0f;
            vel_data[i*4 + 0] = states[i].velocity.x;
            vel_data[i*4 + 1] = states[i].velocity.y;
            vel_data[i*4 + 2] = states[i].velocity.z;
            vel_data[i*4 + 3] = 0.0f;
            mass_data[i] = states[i].mass;
        }

        // Upload to GPU
        check_ok(backend_->copy_to_device(positions_, pos_data.data(),
                                          pos_data.size() * sizeof(float)),
                 "position upload failed");
        check_ok(backend_->copy_to_device(velocities_, vel_data.data(),
                                          vel_data.size() * sizeof(float)),
                 "velocity upload failed");
        check_ok(backend_->copy_to_device(masses_, mass_data.data(),
                                          mass_data.size() * sizeof(float)),
                 "mass upload failed");
    }

    /// Runs the force kernel and then the integration kernel with time step
    /// `dt`, and waits for the device to finish. Does nothing when no entities
    /// have been uploaded.
    /// @throws std::runtime_error if a launch or the synchronization fails.
    void step(float dt) {
        if (num_entities_ == 0) {
            return;  // avoid launching a kernel over zero work items
        }

        const auto config =
            LaunchConfig::linear(static_cast<uint32_t>(num_entities_), 256);

        // KernelArg holds scalars through a pointer (scalar_ptr), so the count
        // lives in a named local that outlives both launches. Passing the
        // static_cast result directly would hand from_value() a temporary that
        // is gone before launch_kernel() runs.
        const int entity_count = static_cast<int>(num_entities_);

        // Compute forces
        const std::vector<KernelArg> force_args = {
            KernelArg::from_buffer(positions_),
            KernelArg::from_buffer(velocities_),
            KernelArg::from_buffer(forces_),
            KernelArg::from_buffer(masses_),
            KernelArg::from_value(entity_count)
        };
        check_ok(backend_->launch_kernel(force_kernel_, config, force_args),
                 "force kernel launch failed");

        // Integrate
        const std::vector<KernelArg> int_args = {
            KernelArg::from_buffer(positions_),
            KernelArg::from_buffer(velocities_),
            KernelArg::from_buffer(forces_),
            KernelArg::from_buffer(masses_),
            KernelArg::from_value(dt),
            KernelArg::from_value(entity_count)
        };
        check_ok(backend_->launch_kernel(integration_kernel_, config, int_args),
                 "integration kernel launch failed");

        check_ok(backend_->synchronize_device(), "device synchronization failed");
    }

    /// Copies positions and velocities back into the first `num_entities_`
    /// elements of `states`; other fields of each state are left untouched.
    /// @throws std::invalid_argument if `states` has fewer elements than the
    ///         number of entities currently on the device.
    void read_results(std::vector<physics::EntityState>& states) {
        if (states.size() < num_entities_) {
            throw std::invalid_argument("states is smaller than the entity count");
        }
        if (num_entities_ == 0) {
            return;  // nothing to read back
        }

        std::vector<float> pos_data(num_entities_ * 4);
        std::vector<float> vel_data(num_entities_ * 4);
        check_ok(backend_->copy_from_device(pos_data.data(), positions_,
                                            pos_data.size() * sizeof(float)),
                 "position readback failed");
        check_ok(backend_->copy_from_device(vel_data.data(), velocities_,
                                            vel_data.size() * sizeof(float)),
                 "velocity readback failed");

        for (size_t i = 0; i < num_entities_; ++i) {
            states[i].position.x = pos_data[i*4 + 0];
            states[i].position.y = pos_data[i*4 + 1];
            states[i].position.z = pos_data[i*4 + 2];
            states[i].velocity.x = vel_data[i*4 + 0];
            states[i].velocity.y = vel_data[i*4 + 1];
            states[i].velocity.z = vel_data[i*4 + 2];
        }
    }

    /// Releases kernels, buffers and the backend. Safe to call when
    /// initialize() was never called, and safe to call more than once.
    void shutdown() {
        if (!backend_) {
            return;
        }
        backend_->destroy_kernel(integration_kernel_);
        backend_->destroy_kernel(force_kernel_);
        backend_->destroy_buffer(positions_);
        backend_->destroy_buffer(velocities_);
        backend_->destroy_buffer(forces_);
        backend_->destroy_buffer(masses_);
        backend_->shutdown();

        // Drop every handle so a later call cannot release them twice.
        backend_.reset();
        integration_kernel_ = KernelHandle{};
        force_kernel_ = KernelHandle{};
        positions_ = BufferHandle{};
        velocities_ = BufferHandle{};
        forces_ = BufferHandle{};
        masses_ = BufferHandle{};
        capacity_ = 0;
        num_entities_ = 0;
    }
};
See Also¶
- Architecture - System architecture overview
- Machine Learning API - GPU-accelerated ML inference
- Configuration - Engine configuration