Run The First Kernel
First we allocate memory on the host and the device using malloc and cudaMalloc:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
int main() {
// Number of elements in each vector.
int size{1'000'000};
// We create 3 pointers for host memory
// and allocate space from host memory.
// NOTE(review): malloc results are unchecked; a failed allocation would
// be passed along silently.
int* A, * B, * C;
A = (int*)malloc(size * sizeof(int));
B = (int*)malloc(size * sizeof(int));
C = (int*)malloc(size * sizeof(int));
// We create 3 pointers for device memory (d_ prefix = device pointer)
// and allocate space from device global memory.
// NOTE(review): cudaMalloc returns a cudaError_t that is ignored here.
int* d_A, * d_B, * d_C;
cudaMalloc((void**)&d_A, size* sizeof(int));
cudaMalloc((void**)&d_B, size* sizeof(int));
cudaMalloc((void**)&d_C, size* sizeof(int));
//create and assign random variable to A, B, and C
// copy memory: host -> device
// run kernel
// copy memory: device -> host
// Free the device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free the host memory
free(A);
free(B);
free(C);
}
Memcpy
- Copy data from the host to the device
- Run the kernel function
- Copy data from the device back to the host
// copy memory: host -> device
// (cudaMemcpy is a blocking call; it returns after the copy completes)
cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);
// run kernel
// copy memory: device -> host
cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);
Kernel function
- Write a kernel function. The kernel function will run on the GPU.
// Computes C[i] = A[i] + B[i], one thread per element.
// Launch with at least ceil(size / blockDim.x) blocks so every element
// gets a thread; surplus threads are filtered by the bounds check below.
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
// Flat 1-D global index of this thread.
int i = threadIdx.x + blockDim.x * blockIdx.x;
// Guard: the final block may contain threads past the end of the arrays.
if (i < size) C[i] = A[i] + B[i];
}
Call the kernel function:
// run kernel
int blockSize = 256; // threads per block (a multiple of the 32-thread warp)
// BUG FIX: size / blockSize truncates. With size = 1'000'000 and
// blockSize = 256 it launches 3906 blocks = 999'936 threads, so the last
// 64 elements of C are never computed. Ceil-division covers the tail; the
// kernel's (i < size) guard makes the surplus threads harmless.
int blockCount = (size + blockSize - 1) / blockSize;
vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);
Code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// |>----------<>----------<>----------<>----------<>----------<|
// |> Kernel Function <|
// |>----------<>----------<>----------<>----------<>----------<|
// Compute vector sum C = A+B
// Each thread performs one pair-wise addition
// Computes C[i] = A[i] + B[i], one thread per element.
// Launch with at least ceil(size / blockDim.x) blocks so every element
// gets a thread; surplus threads are filtered by the bounds check below.
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
// Flat 1-D global index of this thread.
int i = threadIdx.x + blockDim.x * blockIdx.x;
// Guard: the final block may contain threads past the end of the arrays.
if (i < size) C[i] = A[i] + B[i];
}
// |>----------<>----------<>----------<>----------<>----------<|
int main() {
    // Number of elements in each vector.
    int size{1'000'000};

    // Host allocations (unchecked in this minimal example).
    int* A, * B, * C;
    A = (int*)malloc(size * sizeof(int));
    B = (int*)malloc(size * sizeof(int));
    C = (int*)malloc(size * sizeof(int));

    // Device allocations in GPU global memory (d_ prefix = device pointer).
    int* d_A, * d_B, * d_C;
    cudaMalloc((void**)&d_A, size * sizeof(int));
    cudaMalloc((void**)&d_B, size * sizeof(int));
    cudaMalloc((void**)&d_C, size * sizeof(int));

    // copy memory: host -> device
    // NOTE(review): A and B are still uninitialized at this point; the
    // "random values" section later fills them before this copy.
    cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);

    // run kernel
    int blockSize = 256; // threads per block (a multiple of the 32-thread warp)
    // BUG FIX: size / blockSize truncates. With size = 1'000'000 and
    // blockSize = 256 it launches 999'936 threads, leaving the last 64
    // elements of C uncomputed. Ceil-division covers the tail; the kernel's
    // (i < size) guard handles the surplus threads in the final block.
    int blockCount = (size + blockSize - 1) / blockSize;
    vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);

    // copy memory: device -> host (blocks until the kernel has finished)
    cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);

    // Free the device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free the host memory
    free(A);
    free(B);
    free(C);
}
Verify
vecPrint
To verify the result we can print the first n elements of each array:
// Print the first n elements of A separated by '-', then end the line.
void vecPrint(int* A, int n) {
    int i = 0;
    while (i < n) {
        std::cout << A[i] << "-";
        ++i;
    }
    std::cout << std::endl;
}
// copy memory: device -> host
// (cudaMemcpy blocks, so the kernel has finished before we read C)
cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);
// print arrays
// Show the first 10 elements of each vector for a quick visual check.
vecPrint(A, 10);
vecPrint(B, 10);
vecPrint(C, 10);
// Free the device memory
cudaFree(d_A);
If we print the current values of these arrays we will see meaningless values, because we have not assigned any values to them yet.
random values
We can use a random engine to fill the arrays:
#include <iostream>
#include <random>
// |>----------<>----------<>----------<>----------<>----------<|
// |> Random <|
// |>----------<>----------<>----------<>----------<>----------<|
//
// for more information
//https://stackoverflow.com/questions/7114043/random-number-generation-in-c11-how-to-generate-how-does-it-work
// Deterministic pseudo-random helpers built on the Mersenne Twister.
// for more information
//https://stackoverflow.com/questions/7114043/random-number-generation-in-c11-how-to-generate-how-does-it-work
typedef std::mt19937 MyRandom; // 32-bit Mersenne Twister engine
uint32_t seed_val = 555;       // fixed seed -> reproducible sequences
MyRandom MR;                   // one shared engine instance
std::uniform_int_distribution<int32_t> int_dist(-10'000, 10'000);

// Re-seed the shared engine so every run draws the same sequence.
void initRandom(){
    MR.seed(seed_val);
}

// Fill the first `size` elements of A with integers drawn uniformly
// from [-10'000, 10'000].
void fillRandom(int* A, int size) {
    for (int* p = A; p != A + size; ++p)
        *p = int_dist(MR);
}
//create and assign random variable to A, B, and C
// (only A and B are filled; C will receive the GPU result)
initRandom();
fillRandom(A, size);
fillRandom(B, size);
// copy memory: host -> device
cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
Verify
- Write the same addition logic for the CPU side
- Compare the CPU result with the GPU result
Code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <random>
// |>----------<>----------<>----------<>----------<>----------<|
// |> Random <|
// |>----------<>----------<>----------<>----------<>----------<|
//
// for more information
//https://stackoverflow.com/questions/7114043/random-number-generation-in-c11-how-to-generate-how-does-it-work
typedef std::mt19937 MyRandom; // the Mersenne Twister with a popular choice of parameters
uint32_t seed_val = 555; // fixed seed -> reproducible sequences
MyRandom MR; // e.g. keep one global instance (per thread)
// Uniform integers in [-10'000, 10'000].
std::uniform_int_distribution<int32_t> int_dist(-10'000,10'000);
// Seed the shared engine so every run draws the same sequence.
void initRandom(){
MR.seed(seed_val);
}
// Fill the first `size` elements of A with draws from int_dist.
void fillRandom(int* A, int size) {
for(int i = 0; i < size; i++)
A[i] = int_dist(MR);
}
// |>----------<>----------<>----------<>----------<>----------<|
// |> Kernel Function <|
// |>----------<>----------<>----------<>----------<>----------<|
// Compute vector sum C = A+B
// Each thread performs one pair-wise addition
// Computes C[i] = A[i] + B[i], one thread per element.
// Launch with at least ceil(size / blockDim.x) blocks so every element
// gets a thread; surplus threads are filtered by the bounds check below.
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
// Flat 1-D global index of this thread.
int i = threadIdx.x + blockDim.x * blockIdx.x;
// Guard: the final block may contain threads past the end of the arrays.
if (i < size) C[i] = A[i] + B[i];
}
// |>----------<>----------<>----------<>----------<>----------<|
// |> Verify <|
// |>----------<>----------<>----------<>----------<>----------<|
// CPU reference implementation of C = A + B, used to check the GPU result.
void vecAddCPU(int* A, int* B, int* C, int size)
{
    int idx = 0;
    while (idx < size) {
        C[idx] = A[idx] + B[idx];
        ++idx;
    }
}
// Print the first n elements of A separated by '-', then a newline.
void vecPrint(int* A, int n) {
for (int i = 0; i < n; i++) std::cout << A[i] << "-";
std::cout << std::endl;
}
// Compare the GPU result C against a freshly computed CPU reference.
// Prints diagnostics for the first mismatch, or a SUCCESS line if the
// arrays agree element-wise.
void verify(int* A, int* B, int* C, int size) {
    int* C_cpu = (int*)malloc(size * sizeof(int));
    vecAddCPU(A, B, C_cpu, size);
    // Scan for the first index where GPU and CPU results differ.
    int i = 0;
    while (i < size && C[i] == C_cpu[i]) ++i;
    if (i < size) {
        std::cout << "ERROR: the arrays are not the same" << std::endl;
        std::cout << std::endl << std::endl;
        std::cout << "i: " << i << std::endl;
        std::cout << "A[i]: " << A[i] << std::endl;
        std::cout << "B[i]: " << B[i] << std::endl;
        std::cout << "C_GPU[i] " << C[i] << std::endl;
        std::cout << "C_CPU[i] " << C_cpu[i] << std::endl;
    } else {
        std::cout << "SUCCESS: the arrays are the same" << std::endl;
    }
    free(C_cpu);
}
int main() {
    // 100'000 elements (the original wrote 1'00'000 — the same value, now
    // regrouped in conventional groups of three for readability).
    int size{ 100'000 };

    // Host allocations.
    int* A, * B, * C;
    A = (int*)malloc(size * sizeof(int));
    B = (int*)malloc(size * sizeof(int));
    C = (int*)malloc(size * sizeof(int));

    // Device allocations in GPU global memory (d_ prefix = device pointer).
    int* d_A, * d_B, * d_C;
    cudaMalloc((void**)&d_A, size * sizeof(int));
    cudaMalloc((void**)&d_B, size * sizeof(int));
    cudaMalloc((void**)&d_C, size * sizeof(int));

    // Create and assign random values to A and B (C receives the GPU result).
    initRandom();
    fillRandom(A, size);
    fillRandom(B, size);

    // copy memory: host -> device
    cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);

    // run kernel
    int blockSize = 256; // threads per block (a multiple of the 32-thread warp)
    // Ceil-division covers the tail when size is not a multiple of blockSize,
    // without always launching one extra block the way size / blockSize + 1
    // does when size divides evenly. The kernel's (i < size) guard handles
    // any surplus threads in the final block.
    int blockCount = (size + blockSize - 1) / blockSize;
    vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);
    // A kernel launch returns no status directly; check for launch errors.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        std::cout << "CUDA launch error: " << cudaGetErrorString(launchErr) << std::endl;

    // copy memory: device -> host (blocks until the kernel has finished)
    cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);

    // print arrays
    std::cout << "Print first 10 elements:\n";
    vecPrint(A, 10);
    vecPrint(B, 10);
    vecPrint(C, 10);
    std::cout << std::endl;

    // verify the GPU result against the CPU reference
    verify(A, B, C, size);

    // Free the device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free the host memory
    free(A);
    free(B);
    free(C);
}
Result
Print first 10 elements:
4357--3949--9043--6704-8890-9847-3728--560-1624-7846-
6744-6097-7667-247-6183-2580-5667-9878--1766-1044-
11101-2148--1376--6457-15073-12427-9395-9318--142-8890-
SUCCESS: the arrays are the same
Leave a Reply