I recently started to learn OpenCL and how to create kernels for OpenCV as well. I am still working with the basics.
I need to implement an OpenCL kernel to calculate the covariance matrix. OpenCV does not provide an OpenCL kernel for this function.
Technically, what I want to compute is this:
cv::RNG tutu(std::time(nullptr)); // only to check the behaviour of the method across executions
int sz = tutu.uniform(1,20);

cv::Mat_<float> a1(1,sz);
cv::Mat_<float> a2(a1.size());
cv::Mat_<float> c2;

for(std::size_t i=0;i<sz;i++)
{
    a1(i) = tutu.uniform(0,300);
    a2(i) = tutu.uniform(300,600);
}

cv::Mat_<float> f;
cv::vconcat(a1,a2,f);

// ones matrix used to subtract the mean row (implied but missing from my first snippet)
cv::Mat_<float> one = cv::Mat_<float>::ones(f.rows,f.rows);

// process the covariance matrix:
cv::gemm(one,f,-0.5f,f,1.f,c2,0);
cv::gemm(c2.t(),c2,1.f,cv::noArray(),0.f,c2);
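To spell out the algebra behind those two gemm calls (my own summary): with X the n x d data matrix holding one sample per row and J the n x n matrix of ones,

    X_c = X - \frac{1}{n} J X, \qquad C = X_c^{\top} X_c

The first gemm builds the mean-centered matrix X_c (the -0.5f scale is simply -1/n for the n = 2 rows used here), and the second one forms the unnormalized covariance, which is what cv::calcCovarMatrix returns with COVAR_ROWS | COVAR_NORMAL and no COVAR_SCALE.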
I found that OpenCV already provides OpenCL kernels for both gemm and transpose.
So I tried to derive an implementation from an example taken directly from OpenCV's source.
I wrote this:
ocl.h:
void CovarMatrix( cv::Mat_<float>& src,cv::Mat_<float>& covar);
ocl.cpp:
#include <memory>
#include <fstream>
#include <sstream>
#include <iterator>

#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>

namespace test {

namespace ocl {

namespace {

std::unique_ptr<cv::ocl::ProgramSource> cov_src;

void init_cov()
{
    std::ifstream stream("../mahalanobis/covarianceMatrix.cl");
    std::ostringstream sstream;

    sstream << stream.rdbuf();

    cv::String norm_file_content = sstream.str();

    stream.close();

    cov_src.reset(new cv::ocl::ProgramSource(norm_file_content));
}

} // anonymous namespace

static bool ocl_gemm( cv::Mat_<float>& matA, cv::Mat_<float>& matB, cv::Mat_<float>& CV_OUT matD)
{
    cv::Mat_<float> tmp;
    cv::Mat_<float> tmp2;

    cv::Size sizeA = matA.size(), sizeB = matB.size();
    cv::Size sizeD(sizeB.width, sizeA.height);

    const cv::ocl::Device & dev = cv::ocl::Device::getDefault();
    int max_wg_size = (int)dev.maxWorkGroupSize();
    int block_size = (max_wg_size / 32 < 32) ? (max_wg_size / 16 < 16) ? (max_wg_size / 8 < 8) ? 1 : 8 : 16 : 32;

//    matD.create(sizeD);
//    tmp2.create(matD.t().size());

    tmp.create(sizeD);
    tmp2.create(tmp.t().size());

    matD.create(sizeD.width,sizeD.width);

    cv::UMat A = matA.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    cv::UMat B = matB.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
//    cv::UMat D = matD.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    cv::UMat D = tmp.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
//    cv::UMat E(sizeD.width,sizeD.height,CV_32FC1,cv::Scalar::all(0.),cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    cv::UMat E = tmp2.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    cv::UMat F = matD.getUMat(cv::ACCESS_WRITE,cv::USAGE_ALLOCATE_DEVICE_MEMORY);

    matB.copyTo(D);

    int vectorWidths[] = { 4, 4, 2, 2, 1, 4, 1, -1 };
    int kercn = cv::ocl::checkOptimalVectorWidth(vectorWidths, B, D);

    cv::String opts = cv::format(
                "-I /home/administrateur/lib_dir/opencv_dir/opencv_304/opencv/modules/core/src/opencl/ -D T=float -D T1=float -D WT=%s -D cn=1 -D kercn=%d -D LOCAL_SIZE=%d %s -D HAVE_C -D TILE_DIM=32 -D BLOCK_ROWS=8 -D rowsPerWI=1 ",
                cv::ocl::typeToStr(CV_32FC(kercn)), kercn, block_size,
                (sizeA.width % block_size !=0) ? "-D NO_MULT" : "");

    cv::ocl::Kernel k("covarianceMatrix", *cov_src, opts);

    k.args(cv::ocl::KernelArg::ReadOnlyNoSize(A),
           cv::ocl::KernelArg::ReadOnlyNoSize(B, 1, kercn),
           cv::ocl::KernelArg::ReadWrite(D, 1, kercn),
           sizeA.width,
           cv::ocl::KernelArg::ReadWrite(E,kercn,1),
           cv::ocl::KernelArg::ReadWrite(F,kercn,kercn)
           );

    std::size_t globalsize[2] = { static_cast<std::size_t>(sizeD.width / kercn), static_cast<std::size_t>(sizeD.height)};
    std::size_t localsize[2] = { static_cast<std::size_t>(block_size), static_cast<std::size_t>(block_size)};

    return k.run(2, globalsize, block_size!=1 ? localsize : nullptr, false);
}

void CovarMatrix( cv::Mat_<float>& src,cv::Mat_<float>& covar)
{
    if(!covar.empty())
        covar.release();

    cv::Mat_<float> o = cv::Mat_<float>::ones(src.rows,src.rows);

    if(!cov_src)
        init_cov();

    ocl_gemm(o,src,covar);
}

} // namespace ocl
} // namespace test
covarianceMatrix.cl:
#include "gemm.cl" #include "transpose.cl" __kernel void covarianceMatrix ( __global const uchar * A_ptr, int A_step, int A_offset, __global const uchar * B_ptr, int B_step, int B_offset, __global uchar * D_ptr, int D_step, int D_offset, int D_rows, int D_cols, int n, __global uchar * E_ptr, int E_step, int E_offset, int E_rows, int E_cols, __global uchar * F_ptr, int F_step, int F_offset, int F_rows, int F_cols ) { // cv::gemm(src2,src1,-0.5,src1,1.f,src2); // cv::gemm(src2.t(),src2,1.f,cv::noArray(),0.f,dest); gemm(A_ptr,A_step,A_offset, B_ptr,B_step,B_offset, D_ptr,D_step,D_offset,D_rows,D_cols, n,-0.5f,1.f); transpose(D_ptr,D_step,D_offset,D_rows,D_cols*sizeof(float), E_ptr,E_step,E_offset); gemm(E_ptr,E_step,E_offset, D_ptr,D_step,D_offset, F_ptr,F_step,F_offset,F_rows,F_cols, n,1.f,0.f); }
If the size of the matrix is smaller than 6, it works perfectly :). Otherwise… not really. It can be checked with this code:
cv::RNG tutu(std::time(nullptr));

int sz = tutu.uniform(1,20);

cv::Mat_<float> a1(1,sz);
cv::Mat_<float> a2(a1.size());

for(std::size_t i=0;i<sz;i++)
{
    a1(i) = tutu.uniform(0,300);
    a2(i) = tutu.uniform(300,600);
}

cv::Mat_<float> f;
cv::vconcat(a1,a2,f);

cv::Mat_<float> c1;
cv::Mat_<float> c2;
cv::Mat_<float> mean;

// reference
cv::calcCovarMatrix(f,c1,mean,cv::COVAR_ROWS | cv::COVAR_NORMAL,CV_32F);

// check
test::ocl::CovarMatrix(f,c2);

std::size_t cnt(0);
for(auto it = c1.begin(),it2 = c2.begin();it != c1.end();it++,it2++)
    if(*it == *it2)
        cnt++;

std::cout<<"check "<<cnt<<" "<<c1.total()<<std::endl;
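For what it is worth, exact equality on floats is a very strict test; appended to the check above, a tolerance-based comparison could look like this sketch (the 1e-3 threshold is an arbitrary choice of mine):

// Tolerance-based comparison instead of exact float equality (sketch).
double maxAbsDiff = cv::norm(c1, c2, cv::NORM_INF); // largest |c1(i) - c2(i)|
std::cout << "max abs diff: " << maxAbsDiff
          << ((maxAbsDiff < 1e-3) ? " (match)" : " (mismatch)") << std::endl;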
I am still new to OpenCL and I am interested to know what I did wrong.
Has someone already implemented an OpenCL kernel to compute the covariance matrix with OpenCV?
Thanks in advance for any help.
Answer
I resigned myself to writing this code:
void ocl_CovarMatrix(cv::Mat_<float>& src, cv::Mat_<float>& covar)
{
    cv::UMat usrc = src.getUMat(cv::ACCESS_READ,cv::USAGE_ALLOCATE_DEVICE_MEMORY);
    cv::UMat ones = cv::UMat::ones(usrc.rows,usrc.rows,usrc.type());
    cv::UMat utmp;

    double beta = 1.;
    double alpha = -1. / static_cast<double>(usrc.rows);

    cv::gemm(ones,usrc,alpha,usrc,beta,utmp);
    cv::gemm(utmp.t(),utmp,beta,cv::noArray(),0.,utmp);

    utmp.copyTo(covar);

    ones.release();
    utmp.release();
    usrc.release();
}
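For completeness, this is roughly how I call it and compare against cv::calcCovarMatrix (same kind of check as in the question; the small hard-coded matrix is just an example, and ocl_CovarMatrix from above is assumed to be visible in the same translation unit):

#include <iostream>
#include <opencv2/core.hpp>

int main()
{
    cv::Mat_<float> f = (cv::Mat_<float>(2, 4) <<  1,  2,  3,  4,
                                                  10, 20, 30, 40);

    cv::Mat_<float> reference, mean, covar;

    cv::calcCovarMatrix(f, reference, mean, cv::COVAR_ROWS | cv::COVAR_NORMAL, CV_32F);
    ocl_CovarMatrix(f, covar);

    std::cout << "max abs diff: " << cv::norm(reference, covar, cv::NORM_INF) << std::endl;

    return 0;
}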
I suspect the GPU memory is synchronized every time a function is called, which makes this code slower than if everything had been written in a single kernel. But it works efficiently enough.
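If it helps, one way to cut down on those host/device transfers (assuming the caller can keep its data in cv::UMat between calls, which is an assumption on my side) would be a UMat-in/UMat-out variant along these lines:

#include <opencv2/core.hpp>

// Sketch: same computation with UMat in/out, so the data can stay on the device
// across several OpenCL-backed calls and is only downloaded when the caller asks for it.
void ocl_CovarMatrixU(const cv::UMat& usrc, cv::UMat& covar)
{
    cv::UMat ones = cv::UMat::ones(usrc.rows, usrc.rows, usrc.type());
    cv::UMat centered;

    const double alpha = -1. / static_cast<double>(usrc.rows);

    cv::gemm(ones, usrc, alpha, usrc, 1., centered);                 // subtract the mean row
    cv::gemm(centered.t(), centered, 1., cv::noArray(), 0., covar);  // (X - mean)^T * (X - mean)
}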
I am still interested in another solution if someone has an idea.