1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
// Aborts the program if the CUDA runtime has a pending error.
//
// Reads (and clears) the runtime's last-error state via cudaGetLastError().
// When that state is not cudaSuccess, prints `msg`, the runtime's error
// string and the source location of the macro invocation to stderr, then
// terminates the process with exit status 1.
//
// msg: C string describing the operation that was just attempted.
//
// The local is named `err_` because identifiers containing a double
// underscore are reserved for the implementation in C++.
#define cudaCheckError(msg)                                         \
    do {                                                            \
        cudaError_t err_ = cudaGetLastError();                      \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",      \
                    (msg), cudaGetErrorString(err_),                \
                    __FILE__, __LINE__);                            \
            fprintf(stderr, "*** FAILED - ABORTING\n");             \
            exit(1);                                                \
        }                                                           \
    } while (0)
// Demo problem size: A is dy x dx and B is dx x dy, so C = A * B is dy x dy.
constexpr int dy = 2;  // rows of A, columns of B
constexpr int dx = 3;  // columns of A, rows of B
// Dense matrix product C = A * B, computed with one thread per element of C.
//
// Shapes: A is dy x dx, B is dx x dy, C is dy x dy. All three buffers are
// indexed as flat row-major arrays and are dereferenced on the device, so
// they must be device-accessible.
//
// Launch layout: 2D blocks in a 2D grid. The x dimension walks the columns of
// C and the y dimension walks its rows. Threads whose (row, col) falls outside
// C do nothing, so the grid may overshoot the matrix.
__global__ void mmul(const float* A, const float* B, float* C, int dy, int dx) {
    // Generic (m x k) * (k x n) formulation of the product.
    const int m = dy, k = dx, n = dy;

    // Element of C owned by this thread.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Only threads that map onto an element of C compute anything.
    if (row < m && col < n) {
        // Flat offsets are formed in size_t so that row * k and i * n cannot
        // overflow a 32-bit int when the matrices are large.
        const size_t aRowStart = static_cast<size_t>(row) * k;

        // C[row][col] = (row `row` of A) . (column `col` of B)
        float sum = 0.0f;
        for (int i = 0; i < k; i++) {
            // A is m x k row-major; B is k x n row-major.
            sum += A[aRowStart + i] * B[static_cast<size_t>(i) * n + col];
        }
        C[static_cast<size_t>(row) * n + col] = sum;  // C is m x n row-major
    }
}
// Host driver: multiplies a fixed dy x dx matrix A by a fixed dx x dy matrix B
// on the GPU using mmul and prints the dy x dy product C, one row per line.
//
// For the inputs below, A = [[1, 2, 1], [2, 2, 1]] and
// B = [[1, 2], [2, 1], [2, 2]], so the expected product is [[7, 6], [8, 8]].
//
// Every CUDA runtime call is followed by cudaCheckError, which aborts the
// process with exit status 1 on failure. Returns 0 on success.
int main() {
    const int elemsA = dy * dx;
    const int elemsB = dx * dy;
    const int elemsC = dy * dy;
    const size_t bytesA = elemsA * sizeof(float);
    const size_t bytesB = elemsB * sizeof(float);
    const size_t bytesC = elemsC * sizeof(float);

    // Host buffers. The sizes are compile-time constants, so plain arrays are
    // enough and nothing is leaked when cudaCheckError exits early.
    float h_A[elemsA] = {1.0f, 2.0f, 1.0f,
                         2.0f, 2.0f, 1.0f};
    float h_B[elemsB] = {1.0f, 2.0f,
                         2.0f, 1.0f,
                         2.0f, 2.0f};
    float h_C[elemsC] = {0.0f};

    // Device buffers.
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    cudaMalloc((void**)&d_A, bytesA);
    cudaCheckError("cudaMalloc failure");
    cudaMalloc((void**)&d_B, bytesB);
    cudaCheckError("cudaMalloc failure");
    cudaMalloc((void**)&d_C, bytesC);
    cudaCheckError("cudaMalloc failure");

    // Upload the inputs.
    cudaMemcpy(d_A, h_A, bytesA, cudaMemcpyHostToDevice);
    cudaCheckError("cudaMemcpy error");
    cudaMemcpy(d_B, h_B, bytesB, cudaMemcpyHostToDevice);
    cudaCheckError("cudaMemcpy error");

    // Launch configuration: 2 x 2 threads per block, and enough blocks to
    // cover C using ceiling division:
    //   grid.x = ceil(columns of C / block.x)
    //   grid.y = ceil(rows of C    / block.y)
    dim3 block(2, 2);
    dim3 grid((dy + block.x - 1) / block.x, (dy + block.y - 1) / block.y);

    mmul<<<grid, block>>>(d_A, d_B, d_C, dy, dx);
    // The launch itself only reports configuration errors ...
    cudaCheckError("kernel launch failure");
    // ... faults raised while the kernel runs surface at the next sync.
    cudaDeviceSynchronize();
    cudaCheckError("kernel execution failure");

    // Download the result.
    cudaMemcpy(h_C, d_C, bytesC, cudaMemcpyDeviceToHost);
    cudaCheckError("cudaMemcpy error");

    // Print C with dy values per line.
    for (int i = 0; i < elemsC; ++i) {
        std::cout << h_C[i] << " ";
        if ((i + 1) % dy == 0) {
            std::cout << std::endl;
        }
    }

    cudaFree(d_A);
    cudaCheckError("cudaFree failure");
    cudaFree(d_B);
    cudaCheckError("cudaFree failure");
    cudaFree(d_C);
    cudaCheckError("cudaFree failure");

    return 0;
}
|