使用CUDA和瓦片演算法計算矩陣乘法

在許多網友分享的瓦片演算法實現中，執行緒函式存在一些問題：例如只有當矩陣尺寸是瓦片（塊）大小的整數倍時才能計算出正確結果；而對於非整數倍的矩陣，沒有考慮到將邊界瓦片中的殘餘快取清零，導致計算結果錯誤。請參看本例中的執行緒函式及相應註釋。

#include #include #include #include #include template __global__ void matrixmulcuda(float *c, float *a,float *b,int wa,int ha,int wb,int hb) 
if(row < ha && col < wb)
c[row*wb+col] = c;
}void constantinit(float *data, int size) 
//printf("\n");
}int matrixmultiply(int block_size, const dim3 &dimsa,const dim3 &dimsb) 
float *d_a, *d_b, *d_c;
//在視訊記憶體中分配儲存
checkcudaerrors(cudamalloc(reinterpret_cast(&d_a), mem_size_a));
checkcudaerrors(cudamalloc(reinterpret_cast(&d_b), mem_size_b));
checkcudaerrors(cudamalloc(reinterpret_cast(&d_c), mem_size_c));
//建立流物件，用於任務級(grid)同步
cudastream_t stream;
checkcudaerrors(cudastreamcreatewithflags(&stream, cudastreamnonblocking));
checkcudaerrors(cudamemcpyasync(d_a, h_a, mem_size_a, cudamemcpyhosttodevice, stream));
checkcudaerrors(cudamemcpyasync(d_b, h_b, mem_size_b, cudamemcpyhosttodevice, stream));
//啟動計算
dim3 threads(block_size, block_size);
dim3 grid(ceil(1.0*dimsb.x / threads.x), ceil(1.0 * dimsa.y / threads.y));
if (block_size == 16) 
matrixmulcuda<16> <<< grid, threads,16*16, stream>>>
(d_c, d_a, d_b,dimsa.x, dimsa.y, dimsb.x, dimsb.y);
else 
matrixmulcuda<32> <<< grid, threads,32*32, stream>>>
(d_c, d_a, d_b,dimsa.x, dimsa.y, dimsb.x, dimsb.y);
checkcudaerrors(cudastreamsynchronize(stream));//同步stream上的執行緒
//獲取計算結果
checkcudaerrors(cudamemcpyasync(h_c, d_c, mem_size_c, cudamemcpydevicetohost, stream));
checkcudaerrors(cudastreamsynchronize(stream));
for (int i = 0; i < static_cast(dimsc.x * dimsc.y); i++)
printf("%.0f,", h_c[i]);
//計算效能測試
cudaevent_t start, stop;//建立事件物件，用於記錄計算時間
int niter = 300;
checkcudaerrors(cudaeventcreate(&start));
checkcudaerrors(cudaeventcreate(&stop));
checkcudaerrors(cudaeventrecord(start, stream));
for (int j = 0; j < niter; j++) else 
}checkcudaerrors(cudaeventrecord(stop, stream));
checkcudaerrors(cudaeventsynchronize(stop));
float msectotal = 0.0f;
checkcudaerrors(cudaeventelapsedtime(&msectotal, start, stop));
float msecpermatrixmul = msectotal / niter;
double flopspermatrixmul = 2.0 * static_cast(dimsa.x)*static_cast(dimsa.y)*static_cast(dimsb.x);
double gigaflops = (flopspermatrixmul * 1.0e-9f) / (msecpermatrixmul / 1000.0f);
printf("\n浮點計算效能：%.2fgflop/s, 單趟耗時： %.3fms\n",gigaflops,msecpermatrixmul);
//清理
free(h_a);
free(h_b);
free(h_c);
checkcudaerrors(cudafree(d_a));
checkcudaerrors(cudafree(d_b));
checkcudaerrors(cudafree(d_c));
checkcudaerrors(cudaeventdestroy(start));
checkcudaerrors(cudaeventdestroy(stop));
checkcudaerrors(cudastreamdestroy(stream));
return exit_success;
}int main(int argc, char **ar**)

瓦片範圍計算（根據範圍和級別）（Scala）

package com.geoway.gspark.common import com.vividsolutions.jts.geom.import scala.collection.mutable.arraybuffer import com.vividsolutions.jts.io.wktre...

計算矩陣演算法

to change this template,choose tools templates and open the template in the editor.package testmutlchat author administrator public class computematri...

CUDA使用筆記（一）矩陣乘法

簡介：本文介紹 cublasSgemm 函式的使用。在 C/C++ 中，通常我們將 2 維矩陣按行儲存為一維陣列。但是 cuBLAS 假定視訊記憶體中的矩陣是按列儲存的。因此，我們在實際使用時，對 cublasSgemm 中各個引數的賦值可能會搞不清楚。本文以一個具體的矩陣乘法案例為例子，介紹 cublasSgemm 函式的使用...

使用CUDA和瓦片演算法計算矩陣乘法

瓦片範圍計算（根據範圍和級別）（Scala）

計算矩陣演算法

CUDA使用筆記（一）矩陣乘法

相關推薦