CUDA C++ 实现图像处理

最近课程里面有用到 NVIDIA CUDA 框架进行并行编程，实现了一些非常基本的图像处理的操作。使用 CUDA 实现的并行加速能够极大的提升图像处理的效率，这也是为什么近几年的深度学习框架都要依托于 CUDA 进行计算加速。CUDA 本质上是 C/C++ 的拓展，因此对 C/C++ 熟悉的话上手也会很快。

读取保存图像

C++ 中读取图像和保存图像不像 Python 那样方便简单。一开始我想用 OpenCV 来读取/写入图像，但对于我写一个小的示例程序来说可谓麻烦了些，涉及到编译的问题。后来看到了 lodepng 这个小巧方便的库，支持 C 和 C++，使用时只需要引用一个头文件就可以。读取和保存文件只需要如下的几行代码：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
#include "lodepng.h"

// Read image
std::vector<unsigned char> image;
unsigned int width, height;

unsigned error = lodepng::decode(image, width, height, input_file);

// Save image
std::vector<unsigned char>  out_image(image.size(), 255);
error = lodepng::encode(output_file, out_image, width, height);

加载好的图片像素有 4 个通道，按照 RGBARGBA... 的顺序在一维数组中排列的，其中 A 表示的是透明度，0 表示全透明，255 则完全显示当前像素。

CUDA Kernel

CUDA 并行部分的代码需要单独用 __global__ 关键词进行修饰，这一部分的代码会在 device 上执行。NVIDIA 自家的博文 An Even Easier Introduction to CUDA 就通俗易懂的介绍了 CUDA 的一些基本概念。CUDA C++ Programming Guide 有更为全面和详细的例子与参考。

下面是我实现的 RGB 转 Grayscale 的 Kernel 并行代码：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// CUDA Kernel
__global__ void RGB2GrayKernel(unsigned char* input_image, 
                               unsigned char* output_image,
                               int width, int height) {
    
    // input_image size: width*height*Channels
    // output_image size: width*height

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if ((row < height) && (col < width)) {
        // Offset in Grayscale image
        int offset = row*width + col;
        
        // Get RGB values
        unsigned char r = input_image[offset * CHANNELS + 0];
        unsigned char g = input_image[offset * CHANNELS + 1];
        unsigned char b = input_image[offset * CHANNELS + 2];

        // Convert to grayscale
        output_image[offset] = 0.2126f*r + 0.7152f*g + 0.0722f*b;
    }
}

通过 CUDA Kernel 全局变量 blockDim、blockIdx 以及 threadIdx，可以得到当前运行 Kernel 的线程的编号，这样能够方便的访问部分数据。

在 CPU(host) 部分的 main 函数中，我们需要将图片数据拷贝到 device 上的 global memory 中，需要用到 cudaMalloc 以及 cudaMemcpy：

1
2
3
4
5
6
7
8
// Allocate memory for CUDA device
size_t mem_size = width * height * sizeof(unsigned char);
unsigned char* dev_input, *dev_output;

cudaMalloc(&dev_input, mem_size * CHANNELS);
cudaMemcpy(dev_input, image.data(), mem_size * CHANNELS, cudaMemcpyHostToDevice);

cudaMalloc(&dev_output, mem_size);

在 main 中调用 CUDA Kernel 的时候，需要用到尖括号的表达。下面在一个 Block 里我用了 16x16 个线程，然后对于 width x height 的图像，可以得到所需的 Grid 的 Size，使得整个图片能被所有的线程都处理到。

1
2
3
4
5
6
7
#define BLOCK_SIZE 16

// Invoke CUDA kernel
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

RGB2GrayKernel<<<dimGrid, dimBlock>>>(dev_input, dev_output, width, height);

在调用完 Kernel 之后，我们需要将 CUDA device 内存中的数据拷贝到 CPU 上 main 程序里，这样可以在后续使用 lodepng 将图片保存。最后不能忘记用 cudaFree 将分配的 GPU 显存给释放。

1
2
3
4
5
6
// Copy output from device
unsigned char* image_y = new unsigned char[width*height];
cudaMemcpy(image_y, dev_output, mem_size, cudaMemcpyDeviceToHost);

cudaFree(dev_input);
cudaFree(dev_output);

Timing

使用 CUDA C++ 的接口可以方便的计算运行时间，无论是否是 CUDA Kernel 代码：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
#include "cuda_runtime.h"

// Timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);

//
// DO SOMETHING
// WITH OR WITHOUT CUDA
//

// Get elapsed time in ms
cudaEventRecord(stop);

cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);

编译

编译需要用到 CUDA 库中的 NVIDIA CUDA Compiler (NVCC)，在编译的时候注意 GPU 的架构，需要指定 -arch=compute_35 参数，具体的列表可以在 GPU Feature List 里查看。下面我写了一个非常简单的 makefile 文件，写了一个 rgb2gray_cuda 的规则来使用系统中的 CUDA 10 下的 nvcc 来编译我的 rgb2gray.cu 源码。

1
2
3
4
5
6
7
8
NVCC ?= /usr/local/cuda-10.1/bin/nvcc
NVCCFLAGS ?= -arch=compute_35

rgb2gray_cuda:
	$(NVCC) $(NVCCFLAGS) -o rgb2gray_cuda.out rgb2gray.cu lodepng.cpp

clean:
	rm -f *.out

编译时只需要执行 $ make rgb2gray_cuda，即可生成 rgb2gray_cuda.out 可执行文件。可以通过 $ make clean 清除所有编译产生的 .out 文件。

Histogram 与 AtomicAdd

在做直方图统计的时候，多个线程会同时访问直方图结果的数列，为了避免 race condition，需要用到 CUDA 的 atomicAdd 函数：

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
// CUDA Kernel
__global__ void HistogramKernel(unsigned char* d_image, unsigned int* d_histo, int width, int height) {
    
    // input_image size: width*height

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if ((row < height) && (col < width)) {
        int id = row*width + col;
        atomicAdd(&d_histo[d_image[id]], 1);
    }
}

进一步的，可以使用 shared memory 来提升性能。Using Shared Memory in CUDA C/C++ 中给出了一些介绍及基本的例子。

附上完整 rgb2gray.cu 代码：

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
/* 
 * Convert RGB image to grayscale
 * 
 * Load image with lodepng (https://github.com/lvandeve/lodepng)
 *
 * Sample image: lena.png
 *
 * Y = 0.2126*R + 0.7152*G + 0.0722*B
 *
 * */

#include <iostream>
#include <string>

#include "cuda_runtime.h"
#include "lodepng.h"

#define CHANNELS 4          // RGBA in PNG
#define BLOCK_SIZE 16       // Thread block size


// CUDA Kernel
__global__ void RGB2GrayKernel(unsigned char* input_image, 
                               unsigned char* output_image,
                               int width, int height) {
    
    // input_image size: width*height*Channels
    // output_image size: width*height

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if ((row < height) && (col < width)) {
        // Offset in Grayscale image
        int offset = row*width + col;
        
        // Get RGB values
        unsigned char r = input_image[offset * CHANNELS + 0];
        unsigned char g = input_image[offset * CHANNELS + 1];
        unsigned char b = input_image[offset * CHANNELS + 2];

        // Convert to grayscale
        output_image[offset] = 0.2126f*r + 0.7152f*g + 0.0722f*b;
    }
}


/* Main */
int main(int argc, char* argv[]) {
    const char* input_file = argc > 1 ? argv[1] : "lena.png";
    
    // Variables
    std::vector<unsigned char> image;
    unsigned int width, height;

    // Load image
    unsigned error = lodepng::decode(image, width, height, input_file);

    std::cout << width << " x " << height << std::endl;

    // Allocate memory for CUDA device
    size_t mem_size = width * height * sizeof(unsigned char);
    unsigned char* dev_input, *dev_output;

    cudaMalloc(&dev_input, mem_size * CHANNELS);
    cudaMemcpy(dev_input, image.data(), mem_size * CHANNELS, cudaMemcpyHostToDevice);

    cudaMalloc(&dev_output, mem_size);

    // Invoke CUDA kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

    RGB2GrayKernel<<<dimGrid, dimBlock>>>(dev_input, dev_output, 
                                          width, height);

    // Copy output from device
    unsigned char* image_y = new unsigned char[width*height];
    cudaMemcpy(image_y, dev_output, mem_size, cudaMemcpyDeviceToHost);

    cudaFree(dev_input);
    cudaFree(dev_output);

    // Prepare for output
    std::vector<unsigned char> out_image(image.size(), 255);
    for (size_t i = 0; i < width*height; i++) {
        size_t offset = i * CHANNELS;
        out_image[offset + 0] = out_image[offset + 1] = out_image[offset + 2] = image_y[i];
    }

    // Save processed image
    std::string output_file(input_file);
    output_file += ".gray.png";

    error = lodepng::encode(output_file, out_image, width, height);

    delete[] image_y;
    return 0;
}

部分参考链接：

CUDA C++ 实现图像处理

读取保存图像

CUDA Kernel

Timing

编译

Histogram 与 AtomicAdd

相关文章：