使用std::vector的OpenCL矩阵乘法
OpenCL Matrix Multiplication Using std::vector
我正在尝试使用 OpenCL 执行矩阵(向量)乘法。代码本身似乎可以运行,但返回的结果要么是垃圾值,要么是零。据我所知,内核似乎没有接收到正确的值,或者我遗漏了某些不明显的东西——问题出在哪里?我猜测是我分配缓冲区的方式有问题,但我不确定。
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "./cl.hpp"
/// Parses a comma-separated list of float literals and appends them to `vect`.
///
/// Each token may carry a C-style `f` suffix (e.g. `1.5f`); the suffix and
/// anything after it are ignored. Leading whitespace inside a token is skipped.
/// Tokens that do not contain a parsable number (including empty tokens) are
/// skipped instead of appending a stale or uninitialized value.
///
/// @param vect        Destination vector; parsed values are appended in order.
/// @param readStream  Stream holding the comma-separated values; consumed to EOF.
void populate_vector(std::vector<float> &vect, std::stringstream &readStream) {
    std::string token;
    while (std::getline(readStream, token, ',')) {
        // Keep only the text before the first 'f' so "1.5f" parses as 1.5.
        const std::string::size_type suffixPos = token.find('f');
        if (suffixPos != std::string::npos) {
            token.erase(suffixPos);
        }

        // A fresh stream per token: no state (EOF/fail bits) leaks between tokens.
        std::istringstream elementStream(token);
        float value = 0.0f;
        if (elementStream >> value) {
            vect.push_back(value);
        }
    }
}
int main()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if(platforms.empty()){
throw std::runtime_error("No Platforms found, check OpenCL installation.");
}
cl::Platform platform = platforms[0];
std::cout << "Using Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl;
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
if(devices.empty()){
throw std::runtime_error ("No Devices Found, check installation.");
}
cl::Device device = devices[0];
// Create an execusion context
cl::Context context(device);
cl::CommandQueue queue(context,device);
// Load the kernel sources, use global memory
std::ifstream fs("mCrossProd.cl");
if(!fs.is_open()){
throw std::runtime_error("Cannot open kernel source file.");
}
// Extract kernel code
std::stringstream ss;
ss << fs.rdbuf();
auto code = ss.str();
cl::Program::Sources sources;
sources.push_back({code.c_str(), code.length()});
fs.close();
// Build the kernel
cl::Program program(context, sources);
if(program.build({device})!= CL_SUCCESS){
std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "n";
exit(1);
}
// Output matrix dimensions
int M = 4, N = 3, K = 6;
int A_dims = M * K;
int B_dims = N * K;
int C_dims = M * N;
// Create buffers for device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*A_dims);
cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(float)*B_dims);
cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(float)*C_dims);
std::string s;
std::vector<float> A;
std::vector<float> B;
std::vector<float> C;
std::ifstream infile("matrices.txt");
std::cout << "Opened file to read" << 'n';
std::getline(infile,s);
//std::cout << s;
std::stringstream mss(s);
populate_vector(A, mss);
std::copy(A.begin(), A.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << 'n';
mss.str("");
mss.clear();
std::getline(infile,s);
mss << s;
populate_vector(B, mss);
std::copy(B.begin(), B.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << 'n';
mss.str("");
mss.clear();
std::getline(infile,s);
mss << s;
populate_vector(C, mss);
std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << 'n';
//write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);
std::cout << A.size() * sizeof(float) << 'n';
std::cout << B.size() * sizeof(float) << 'n';
std::cout << C.size() * sizeof(float) << 'n';
// Select kernel, pass arguments
cl::Kernel kernel = cl::Kernel(program, "mCrossProd");
kernel.setArg(0, M);
kernel.setArg(1, N);
kernel.setArg(2, K);
kernel.setArg(3, buffer_A);
kernel.setArg(4, buffer_B);
kernel.setArg(5, buffer_C);
// Execute kernel
if( queue.enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(M,N),cl::NDRange(1,1)) != CL_SUCCESS )
{
std::cout << "Failed to launch kernel" << std::endl;
exit(1);
}
queue.finish();
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,C.size(),&C[0]);
std::cout << C.size() << std::endl;
std::cout << C_dims << std::endl;
std::cout << M << " " << N << std::endl;
std::cout << "nThe solution is" << std::endl;
std::copy(C.begin(), C.end(), std::ostream_iterator<float>(std::cout, ", "));
std::cout << 'n';
for(int i = 0; i < M; i++) {
for(int j = 0; j < N; j++) {
std::cout << "C[" + std::to_string(i*N+j) + "] = ";
std::cout << C[i*N+j] << " ";
}
std::cout << std::endl;
}
}
测试内核
// Computes one element of C = A * B per work-item.
//   A is read as an M x K row-major matrix (A[i*K + k]),
//   B is read as a K x N row-major matrix (B[N*k + j]),
//   C is written as an M x N row-major matrix (C[i*N + j]).
// Work-item (i, j) is taken from global ids 0 and 1. The dot product is
// accumulated in a private variable and stored once, so the result does not
// depend on whatever C held before the launch.
__kernel void mCrossProd(const int M, const int N, const int K, __global float* A, __global float* B, __global float* C) {
    int const i = get_global_id(0);
    int const j = get_global_id(1);
    // Ignore work-items outside the matrix (e.g. a padded global range).
    if (i >= M || j >= N) {
        return;
    }
    int const debug_elem_id = 3; // purely for debug purposes.
    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        float const prod = A[i*K+k] * B[N*k+j];
        acc += prod;
        if ((i*N+j) == debug_elem_id)
        {
            //printf("PROD, i = %d, j = %d, k = %d, N = %d\n", i,j,k,N);
            printf("PROD, %.2f\n", prod);
            printf("SUM: %.2f\n", acc);
        }
    }
    C[i*N+j] = acc;
}
matrices.txt 的内容(每行依次为矩阵 A、B、C)
1.5f, 1.0f, 2.0f, 2.0f, 4.0f, 1.0f, 4.0f, 2.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 2.0f, 5.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 2.0f, 1.0f, 1.0f
1.5f, 2.0f, 4.0f, 1.0f, 1.0f, 2.0f, 4.0f, 2.0f, 1.0f, 0.0f, 0.0f, 1.0f, 9.0f, 2.0f, 1.0f, 2.0f, 1.0f, 0.0f
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,A.size()*sizeof(float),&A);
queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,B.size()*sizeof(float),&B);
这里的 `&A` 应该是 `A.data()` 或 `&A[0]`,我推荐第一个(`&B` 同理)。`&A` 是 std::vector 对象本身的地址,而不是其元素存储区的地址,因此复制到设备上的并不是矩阵数据。
相关文章:
- 使用std::vector的OpenCL矩阵乘法
- POCO::PostgreSQL:如何将std::vector支持添加到`Binder::bind`
- std::vector的包装器,使数组的结构看起来像结构的数组
- 编译器如何区分std::vector的构造函数
- 使用 pqxx 将 std::vector 存储在 postgresql 中,并从数据库中检索它
- 在std::vector上存储带有模板的类实例
- 在main()之外初始化std::vector会导致性能下降(多线程)
- 为什么std::vector比数组慢
- std::vector::迭代器是否可以合法地作为指针
- 如何将二进制格式的 C++ 对象的 std::vector 保存到磁盘?
- 为什么std::vector和std::valarray初始化构造函数不同
- 创建 std::vector 时出现"结果类型必须可从输入范围的值类型构造"
- 在没有未定义行为的情况下实现类似std::vector的容器
- 如何调整 std::vector of Eigen::MatrixXd 的大小
- 使用 std::vector::reverse_iterator 将 int 序列化为字节向量?
- 如何将AERT_Allocate与 std::vector 一起使用
- 推导 std::vector::back() 的返回类型
- 如何将原始字节附加到 std::vector?
- std::vector 没有重载函数的实例与参数列表匹配
- 如果 KEY 是 std::list 或 std::vector 而不是值,那么 std::map 的默认行为是什么?