// Launch kernel int threadsPerBlock = 256; int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
int main() int n = 1000000; size_t bytes = n * sizeof(float); cuda toolkit
.PHONY: all run clean | Operation | Function | |-----------|----------| | Allocate GPU memory | cudaMalloc(&ptr, size) | | Free GPU memory | cudaFree(ptr) | | Copy to GPU | cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice) | | Copy to CPU | cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost) | | Get GPU count | cudaGetDeviceCount(&count) | | Set active GPU | cudaSetDevice(device_id) | | Synchronize | cudaDeviceSynchronize() | | Error checking | cudaGetLastError() | Installation Check # Check CUDA version nvcc --version Check GPU driver & CUDA capability nvidia-smi Check available GPUs nvidia-smi -L This gives you a working starting point. Need a specific CUDA library example (cuBLAS for matrix multiplication, cuFFT for FFTs, or multi-GPU programming)? // Launch kernel int threadsPerBlock = 256; int