Я пытаюсь выполнить основные операции +
с CUDA для вычислений на GPU. Функция vectorIncreaseOne
это экземпляр для деталей операции и gpuIncreaseOne
функция — это структура для применения операции к каждому элементу параметра. data_for_calculation
.
Экспериментальная реализация
Экспериментальная реализация gpuIncreaseOne
функция как показано ниже.
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <helper_cuda.h>
#include <math.h>
__global__ void CUDACalculation::vectorIncreaseOne(const long double* input, long double* output, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
if (input[i] < 255)
{
output[i] = input[i] + 1;
}
}
}
int CUDACalculation::gpuIncreaseOne(float* data_for_calculation, int size)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = size;
size_t DataSize = numElements * sizeof(float);
// Allocate the device input vector A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, DataSize);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector A (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, DataSize);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector B (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, DataSize);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to allocate device vector C (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input vectors A and B in host memory to the device input vectors in
// device memory
err = cudaMemcpy(d_A, data_for_calculation, DataSize, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threadsn", blocksPerGrid, threadsPerBlock);
vectorIncreaseOne <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
err = cudaMemcpy(data_for_calculation, d_C, DataSize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector A (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_B);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector B (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_C);
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to free device vector C (error code %s)!n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
return 0;
}
Тестовые кейсы
Тестовый пример для gpuIncreaseOne
функция как показано ниже.
auto data_pointer = (float*)malloc(100 * sizeof(float));
for (int i = 0; i < 100; i++)
{
data_pointer[i] = static_cast<float>(1);
}
CUDACalculation::gpuIncreaseOne(data_pointer, 100);
free(data_pointer);
Все предложения приветствуются.
Если есть какие-либо улучшения по поводу:
- Возможный недостаток или ненужные накладные расходы
- Обработка ошибок
пожалуйста, дайте мне знать.