CUDA Vector Addition

Run the First Kernel

First we allocate memory on the host and the device with malloc and cudaMalloc:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

int main() {
	int size{1'000'000};
	
	// Create 3 pointers for host memory
	// and allocate space in host memory.
	int* A, * B, * C; 
	A = (int*)malloc(size * sizeof(int));
	B = (int*)malloc(size * sizeof(int));
	C = (int*)malloc(size * sizeof(int));

	// Create 3 pointers for device memory
	// and allocate space in device global memory.
	int* d_A, * d_B, * d_C; 
	cudaMalloc((void**)&d_A, size * sizeof(int));
	cudaMalloc((void**)&d_B, size * sizeof(int));
	cudaMalloc((void**)&d_C, size * sizeof(int));

	// create and fill A and B with random values

	// copy memory: host -> device
	// run kernel 
	// copy memory: device -> host

	// Free the device memory 
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// Free the host memory 
	free(A);
	free(B);
	free(C);
}
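Every CUDA runtime call above returns a cudaError_t that this skeleton silently ignores. Below is a minimal sketch of a checking helper for development builds; the name checkCuda is our own choice, not part of the CUDA API:

#include <cstdlib>
#include <iostream>
#include "cuda_runtime.h"

// Hypothetical helper: abort with a readable message on any CUDA error.
void checkCuda(cudaError_t err, const char* what) {
	if (err != cudaSuccess) {
		std::cout << what << " failed: " << cudaGetErrorString(err) << std::endl;
		std::exit(EXIT_FAILURE);
	}
}

// usage:
//   checkCuda(cudaMalloc((void**)&d_A, size * sizeof(int)), "cudaMalloc d_A");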

cudaMemcpy

  • Copy data from host to device
  • Run the kernel function
  • Copy data from device to host
	// copy memory: host -> device
	cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);

	// run kernel 

	// copy memory: device -> host
	cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);
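For reference, the signature of cudaMemcpy in the CUDA runtime API is shown below; the last argument selects the copy direction, and for ordinary pageable host memory the call behaves synchronously, returning only after the copy has completed:

cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind);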

Kernel function

  • Write a kernel function; it will run on the GPU.
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
	int i = threadIdx.x + blockDim.x * blockIdx.x;
	if (i < size) C[i] = A[i] + B[i];
}
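Each thread computes one unique global index: thread threadIdx.x of block blockIdx.x handles element threadIdx.x + blockDim.x * blockIdx.x. A common alternative, not used in this post, is the grid-stride loop, which lets a grid of any size cover an array of any length; a minimal sketch:

__global__
void vecAddGPUStride(int* A, int* B, int* C, int size)
{
	// Total number of threads in the whole grid.
	int stride = blockDim.x * gridDim.x;
	// Each thread starts at its global index and hops by the grid size,
	// so together the threads cover every element regardless of blockCount.
	for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride)
		C[i] = A[i] + B[i];
}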

Call the kernel function:

	// run kernel 
	int blockSize = 256; // every block contains 256 threads
	int blockCount = (size + blockSize - 1) / blockSize; // round up so every element is covered
	vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);

Note the ceiling division: with size = 1'000'000 and blockSize = 256, plain integer division would launch 3906 blocks and leave the last 64 elements unprocessed; rounding up launches 3907 blocks, and the if (i < size) guard in the kernel discards the surplus threads.

Code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>

// |>----------<>----------<>----------<>----------<>----------<|
// |>                     Kernel Function                      <|
// |>----------<>----------<>----------<>----------<>----------<|


// Compute vector sum C = A+B
// Each thread performs one pair-wise addition
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
	int i = threadIdx.x + blockDim.x * blockIdx.x;
	if (i < size) C[i] = A[i] + B[i];
}


// |>----------<>----------<>----------<>----------<>----------<|

int main() {
	int size{1'000'000};
	
	// Create 3 pointers for host memory
	// and allocate space in host memory.
	int* A, * B, * C; 
	A = (int*)malloc(size * sizeof(int));
	B = (int*)malloc(size * sizeof(int));
	C = (int*)malloc(size * sizeof(int));

	// Create 3 pointers for device memory
	// and allocate space in device global memory.
	int* d_A, * d_B, * d_C; 
	cudaMalloc((void**)&d_A, size * sizeof(int));
	cudaMalloc((void**)&d_B, size * sizeof(int));
	cudaMalloc((void**)&d_C, size * sizeof(int));

	// copy memory: host -> device
	cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);

	// run kernel 
	int blockSize = 256; // every block contains 256 threads
	int blockCount = (size + blockSize - 1) / blockSize; // round up so every element is covered
	vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);

	// copy memory: device -> host
	cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);

	// Free the device memory 
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// Free the host memory 
	free(A);
	free(B);
	free(C);
}
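A detail worth noting about the listing above: the kernel launch is asynchronous, so the call returns before the kernel finishes; here the cudaMemcpy back to the host is what implicitly waits for it. To surface launch errors explicitly, a sketch like this can go right after the launch:

	vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);

	// The launch returns immediately: check for configuration errors now,
	// then wait for the kernel itself to finish before using the results.
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;
	cudaDeviceSynchronize();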

Verify

vecPrint

To verify the result, we can print the first n elements of each array:


void vecPrint(int* A, int n) {
	for (int i = 0; i < n; i++) std::cout << A[i] << "-";
	std::cout << std::endl;
}
	// copy memory: device -> host
	cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);

	// print arrays
	vecPrint(A, 10);
	vecPrint(B, 10);
	vecPrint(C, 10);

	// Free the device memory 
	cudaFree(d_A);

If we print the current values of these arrays, we will see meaningless values, because we have not assigned anything to them yet.

Random values

We can use a random engine to fill the arrays:

#include <iostream>
#include <random>

// |>----------<>----------<>----------<>----------<>----------<|
// |>                          Random                          <|
// |>----------<>----------<>----------<>----------<>----------<|
// 
// for more information
//https://stackoverflow.com/questions/7114043/random-number-generation-in-c11-how-to-generate-how-does-it-work

typedef std::mt19937 MyRandom;  // the Mersenne Twister with a popular choice of parameters
uint32_t seed_val = 555;           // populate somehow

MyRandom MR;                   // e.g. keep one global instance (per thread)
std::uniform_int_distribution<int32_t> int_dist(-10'000,10'000);


void initRandom(){
	MR.seed(seed_val);
}
void fillRandom(int* A, int size) {
	for(int i = 0; i < size; i++)
		A[i] = int_dist(MR);
}
	// create and fill A and B with random values
	initRandom();
	fillRandom(A, size);
	fillRandom(B, size);

	// copy memory: host -> device
	cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
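Because the engine is seeded with a fixed value (555), every run draws the same sequence, which makes the program's output reproducible. A small sketch of this property, using the globals defined above:

	initRandom();
	int a = int_dist(MR);   // first draw after seeding
	MR.seed(seed_val);      // reseeding restarts the sequence
	int b = int_dist(MR);   // first draw again
	// a == b always holds: std::mt19937 is deterministic for a given seed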

Verify

  • Write the same computation for the CPU side
  • Compare the CPU result with the GPU result (see vecAddCPU and verify in the listing below)

Code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <random>

// |>----------<>----------<>----------<>----------<>----------<|
// |>                          Random                          <|
// |>----------<>----------<>----------<>----------<>----------<|
// 
// for more information
//https://stackoverflow.com/questions/7114043/random-number-generation-in-c11-how-to-generate-how-does-it-work

typedef std::mt19937 MyRandom;  // the Mersenne Twister with a popular choice of parameters
uint32_t seed_val = 555;           // populate somehow

MyRandom MR;                   // e.g. keep one global instance (per thread)
std::uniform_int_distribution<int32_t> int_dist(-10'000,10'000);


void initRandom(){
	MR.seed(seed_val);
}
void fillRandom(int* A, int size) {
	for(int i = 0; i < size; i++)
		A[i] = int_dist(MR);
}


// |>----------<>----------<>----------<>----------<>----------<|
// |>                     Kernel Function                      <|
// |>----------<>----------<>----------<>----------<>----------<|


// Compute vector sum C = A+B
// Each thread performs one pair-wise addition
__global__
void vecAddGPU(int* A, int* B, int* C, int size)
{
	int i = threadIdx.x + blockDim.x * blockIdx.x;
	if (i < size) C[i] = A[i] + B[i];
}


// |>----------<>----------<>----------<>----------<>----------<|
// |>                          Verify                          <|
// |>----------<>----------<>----------<>----------<>----------<|


void vecAddCPU(int* A, int* B, int* C, int size)
{
	for(int i = 0; i < size; i++)
		C[i] = A[i] + B[i];
}

void vecPrint(int* A, int n) {
	for (int i = 0; i < n; i++) std::cout << A[i] << "-";
	std::cout << std::endl;
}

void verify(int* A, int* B, int* C, int size) {
	int * C_cpu = (int*)malloc(size * sizeof(int));
	vecAddCPU(A, B, C_cpu, size);

	for (int i = 0; i < size; i++) {
		if (C[i] != C_cpu[i]) {
			std::cout << "ERROR: the arrays are not the same" << std::endl;
			std::cout << std::endl << std::endl;

			std::cout << "i: " << i << std::endl;
			std::cout << "A[i]: " << A[i] << std::endl;
			std::cout << "B[i]: " << B[i] << std::endl;
			std::cout << "C_GPU[i] " << C[i] << std::endl;
			std::cout << "C_CPU[i] " << C_cpu[i] << std::endl;

			free(C_cpu);
			return;
		}
	}


	std::cout << "SUCCESS: the arrays are the same" << std::endl;

	free(C_cpu);

}


int main() {
	int size{ 100'000 };
	
	// Create 3 pointers for host memory
	// and allocate space in host memory.
	int* A, * B, * C; 
	A = (int*)malloc(size * sizeof(int));
	B = (int*)malloc(size * sizeof(int));
	C = (int*)malloc(size * sizeof(int));

	// Create 3 pointers for device memory
	// and allocate space in device global memory.
	int* d_A, * d_B, * d_C; 
	cudaMalloc((void**)&d_A, size * sizeof(int));
	cudaMalloc((void**)&d_B, size * sizeof(int));
	cudaMalloc((void**)&d_C, size * sizeof(int));

	// create and fill A and B with random values
	initRandom();
	fillRandom(A, size);
	fillRandom(B, size);

	// copy memory: host -> device
	cudaMemcpy(d_A, A, size * sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, B, size * sizeof(int), cudaMemcpyHostToDevice);

	// run kernel 
	int blockSize = 256; // every block contains 256 threads
	int blockCount = (size + blockSize - 1) / blockSize; // round up so every element is covered
	vecAddGPU<<<blockCount, blockSize>>>(d_A, d_B, d_C, size);

	// copy memory: device -> host
	cudaMemcpy(C, d_C, size * sizeof(int), cudaMemcpyDeviceToHost);

	// print arrays
	std::cout << "Print first 10 elements:\n";
	vecPrint(A, 10);
	vecPrint(B, 10);
	vecPrint(C, 10);
	std::cout << std::endl;

	//verify
	verify(A,B,C,size);

	// Free the device memory 
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);

	// Free the host memory 
	free(A);
	free(B);
	free(C);
}
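Assuming the listing is saved as vector_add.cu (the file name is our own choice), it can be compiled and run with nvcc from the CUDA Toolkit:

nvcc vector_add.cu -o vector_add
./vector_add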

Result

Print first 10 elements:
4357--3949--9043--6704-8890-9847-3728--560-1624-7846-
6744-6097-7667-247-6183-2580-5667-9878--1766-1044-
11101-2148--1376--6457-15073-12427-9395-9318--142-8890-

SUCCESS: the arrays are the same

The double dashes in the output appear wherever a negative number follows the "-" separator that vecPrint prints after each element.
