CUDA Conv

CUDA Parallel Programming - This article is part of a series.
Part 3: This Article


Implementing a 2D convolution in PyTorch is straightforward:

import time

import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

width = 1000
height = 1000
img = torch.randn([width, height])
img = img.to(device)

kernel = torch.tensor([[-1.0, 0.0, 1.0],
                       [-1.0, 0.0, 1.0],
                       [-1.0, 0.0, 1.0]])

img = torch.reshape(img, (1, 1, width, height))
kernel = torch.reshape(kernel, (1, 1, 3, 3))
kernel = kernel.to(device)

start = time.perf_counter()
output = F.conv2d(img, kernel, stride=1)
if device.type == "cuda":
    torch.cuda.synchronize()  # kernel launches are asynchronous; wait before stopping the timer
end = time.perf_counter()

print(f'total_cost: {(end - start) * 1000} ms')
print(f'output_size: {output.shape}')
print(f'output_tensor: {output}')
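
The host-side timer above only measures correctly once the device has been synchronized. PyTorch can also time the kernel on the GPU itself with CUDA events, which mirrors the cudaEvent-based measurement used in the CUDA version below. A minimal sketch, assuming a CUDA device is available (the input shapes here are chosen to match the example above):

import torch
import torch.nn.functional as F

assert torch.cuda.is_available()
device = torch.device("cuda")

img = torch.randn(1, 1, 1000, 1000, device=device)
kernel = torch.tensor([[-1.0, 0.0, 1.0]] * 3, device=device).reshape(1, 1, 3, 3)

start = torch.cuda.Event(enable_timing=True)
stop = torch.cuda.Event(enable_timing=True)

start.record()                       # events are recorded on the current CUDA stream
output = F.conv2d(img, kernel, stride=1)
stop.record()
torch.cuda.synchronize()             # wait until both events have completed

print(f'total_cost: {start.elapsed_time(stop)} ms')  # elapsed_time returns milliseconds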

To implement the same convolution in CUDA, the steps are:

  • Error checking (the CHECK macro)
  • Get thread info (query device properties)
  • Allocate device memory
  • Copy data to the device
  • Call the conv kernel
  • Copy the result back to the host
  • Free memory

//file: conv.cu
#include <stdint.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>

const int NUM_REPEATS = 10;

#define CHECK(call)                                   \
do                                                    \
{                                                     \
    const cudaError_t error_code = call;              \
    if (error_code != cudaSuccess)                    \
    {                                                 \
        printf("CUDA Error:\n");                      \
        printf("    File:       %s\n", __FILE__);     \
        printf("    Line:       %d\n", __LINE__);     \
        printf("    Error code: %d\n", error_code);   \
        printf("    Error text: %s\n",                \
            cudaGetErrorString(error_code));          \
        exit(1);                                      \
    }                                                 \
} while (0)

static void HandleError(cudaError_t err,
    const char* file,
    int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n",
            cudaGetErrorString(err),
            file, line);
        exit(EXIT_FAILURE);
    }
}

#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

int getThreadNum()
{
    cudaDeviceProp prop;
    int count;

    CHECK(cudaGetDeviceCount(&count));
    printf("GPU num: %d\n", count);
    CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("Max thread num per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max grid dim: %d, %d, %d)\n",
        prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    return prop.maxThreadsPerBlock;
}

__global__ void conv(float* img, float* kernel, float* result,
    int width, int height, int kernelSize)
{
    int ti = threadIdx.x;
    int bi = blockIdx.x;
    int id = bi * blockDim.x + ti;   // one thread per output pixel
    if (id >= width * height)
    {
        return;
    }
    int row = id / width;
    int col = id % width;
    float sum = 0.0f;                // accumulate locally; cudaMalloc'd memory is not zeroed
    for (int i = 0; i < kernelSize; ++i)
    {
        for (int j = 0; j < kernelSize; ++j)
        {
            float imgValue = 0.0f;   // zero padding outside the image
            int curRow = row - kernelSize / 2 + i;
            int curCol = col - kernelSize / 2 + j;
            if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width)
            {
                imgValue = img[curRow * width + curCol];
            }
            sum += kernel[i * kernelSize + j] * imgValue;
        }
    }
    result[id] = sum;
}

int main()
{
    int width = 1000;
    int height = 1000;
    float* img = new float[width * height];
    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            img[col + row * width] = (col + row) % 256;
        }
    }

    int kernelSize = 3;
    float* kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; ++i)
    {
        kernel[i] = i % kernelSize - 1;
    }

    float* imgGpu;
    float* kernelGpu;
    float* resultGpu;

    CHECK(cudaMalloc((void**)&imgGpu, width * height * sizeof(float)));
    CHECK(cudaMalloc((void**)&kernelGpu, kernelSize * kernelSize * sizeof(float)));
    CHECK(cudaMalloc((void**)&resultGpu, width * height * sizeof(float)));

    CHECK(cudaMemcpy(imgGpu, img,
        width * height * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(kernelGpu, kernel,
        kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice));

    int threadNum = getThreadNum();
    int blockNum = (width * height + threadNum - 1) / threadNum;   // ceiling division

    float t_sum = 0;
    float t2_sum = 0;
    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);

        conv<<<blockNum, threadNum>>>
            (imgGpu, kernelGpu, resultGpu, width, height, kernelSize);

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }

        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    const float t_ave = t_sum / NUM_REPEATS;
    const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
    printf("Time = %g +- %g ms.\n", t_ave, t_err);


    float* result = new float[width * height];
    CHECK(cudaMemcpy(result, resultGpu,
        width * height * sizeof(float), cudaMemcpyDeviceToHost));

    // visualization
    printf("img\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", img[col + row * width]);
        }
        printf("\n");
    }
    printf("kernel\n");
    for (int row = 0; row < kernelSize; ++row)
    {
        for (int col = 0; col < kernelSize; ++col)
        {
            printf("%2.0f ", kernel[col + row * kernelSize]);
        }
        printf("\n");
    }

    printf("result\n");
    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            printf("%2.0f ", result[col + row * width]);
        }
        printf("\n");
    }
    // free device and host memory
    CHECK(cudaFree(imgGpu));
    CHECK(cudaFree(kernelGpu));
    CHECK(cudaFree(resultGpu));
    delete[] img;
    delete[] kernel;
    delete[] result;

    return 0;
}
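
The file can be compiled with nvcc, e.g. nvcc conv.cu -o conv, and run directly. Note that the kernel zero-pads at the borders and writes a same-sized 1000x1000 output, so it corresponds to F.conv2d with padding=1 rather than the unpadded call in the PyTorch snippet above. A small PyTorch sketch for spot-checking the printed result, rebuilding the same ramp image and kernel as conv.cu (the variable names here are illustrative, not part of the original code):

import torch
import torch.nn.functional as F

# Rebuild the inputs used in conv.cu: a (row + col) % 256 ramp image
# and a 3x3 kernel whose rows are [-1, 0, 1].
width, height = 1000, 1000
rows = torch.arange(height).reshape(height, 1)
cols = torch.arange(width).reshape(1, width)
img = ((rows + cols) % 256).float().reshape(1, 1, height, width)
kernel = torch.tensor([[-1.0, 0.0, 1.0]] * 3).reshape(1, 1, 3, 3)

# padding=1 reproduces the zero-padded, same-size output of the CUDA kernel
ref = F.conv2d(img, kernel, stride=1, padding=1)

# Top-left 10x10 patch; should match the "result" block printed by conv.cu
print(ref[0, 0, :10, :10])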