使用CUDA和瓦片演算法計算矩陣乘法

2021-10-10 18:36:35 字數 3067 閱讀 1135

有很多網友分享的瓦片演算法實現中,執行緒函式的實作有些問題(例如只對整倍塊的矩陣能計算出正確結果,而對非整倍塊的矩陣沒有考慮到為殘餘快取清零,導致計算結果錯誤),請看本例中的執行緒函式及相應注釋。

​#include #include #include #include #include template __global__ void matrixmulcuda(float *c, float *a,float *b,int wa,int ha,int wb,int hb) 

if(row < ha && col < wb)

c[row*wb+col] = c;

}void constantinit(float *data, int size)

//printf("\n");

}int matrixmultiply(int block_size, const dim3 &dimsa,const dim3 &dimsb)

float *d_a, *d_b, *d_c;

//在視訊記憶體中分配儲存

checkcudaerrors(cudamalloc(reinterpret_cast(&d_a), mem_size_a));

checkcudaerrors(cudamalloc(reinterpret_cast(&d_b), mem_size_b));

checkcudaerrors(cudamalloc(reinterpret_cast(&d_c), mem_size_c));

//建立流物件,用於任務級(grid)同步

cudastream_t stream;

checkcudaerrors(cudastreamcreatewithflags(&stream, cudastreamnonblocking));

checkcudaerrors(cudamemcpyasync(d_a, h_a, mem_size_a, cudamemcpyhosttodevice, stream));

checkcudaerrors(cudamemcpyasync(d_b, h_b, mem_size_b, cudamemcpyhosttodevice, stream));

//啟動計算

dim3 threads(block_size, block_size);

dim3 grid(ceil(1.0*dimsb.x / threads.x), ceil(1.0 * dimsa.y / threads.y));

if (block_size == 16)

matrixmulcuda<16> <<< grid, threads,16*16, stream>>>

(d_c, d_a, d_b,dimsa.x, dimsa.y, dimsb.x, dimsb.y);

else

matrixmulcuda<32> <<< grid, threads,32*32, stream>>>

(d_c, d_a, d_b,dimsa.x, dimsa.y, dimsb.x, dimsb.y);

checkcudaerrors(cudastreamsynchronize(stream));//同步stream上的執行緒

//獲取計算結果

checkcudaerrors(cudamemcpyasync(h_c, d_c, mem_size_c, cudamemcpydevicetohost, stream));

checkcudaerrors(cudastreamsynchronize(stream));

for (int i = 0; i < static_cast(dimsc.x * dimsc.y); i++)

printf("%.0f,", h_c[i]);

//計算效能測試

cudaevent_t start, stop;//建立事件物件,用於記錄計算時間

int niter = 300;

checkcudaerrors(cudaeventcreate(&start));

checkcudaerrors(cudaeventcreate(&stop));

checkcudaerrors(cudaeventrecord(start, stream));

for (int j = 0; j < niter; j++) else

}checkcudaerrors(cudaeventrecord(stop, stream));

checkcudaerrors(cudaeventsynchronize(stop));

float msectotal = 0.0f;

checkcudaerrors(cudaeventelapsedtime(&msectotal, start, stop));

float msecpermatrixmul = msectotal / niter;

double flopspermatrixmul = 2.0 * static_cast(dimsa.x)*static_cast(dimsa.y)*static_cast(dimsb.x);

double gigaflops = (flopspermatrixmul * 1.0e-9f) / (msecpermatrixmul / 1000.0f);

printf("\n浮點計算效能:%.2fgflop/s, 單趟耗時: %.3fms\n",gigaflops,msecpermatrixmul);

//清理

free(h_a);

free(h_b);

free(h_c);

checkcudaerrors(cudafree(d_a));

checkcudaerrors(cudafree(d_b));

checkcudaerrors(cudafree(d_c));

checkcudaerrors(cudaeventdestroy(start));

checkcudaerrors(cudaeventdestroy(stop));

checkcudaerrors(cudastreamdestroy(stream));

return exit_success;

}int main(int argc, char **ar**)

瓦片範圍計算(根據範圍和級別)(Scala)

package com.geoway.gspark.common import com.vividsolutions.jts.geom.import scala.collection.mutable.arraybuffer import com.vividsolutions.jts.io.wktre...

計算矩陣演算法

to change this template,choose tools templates and open the template in the editor.package testmutlchat author administrator public class computematri...

CUDA使用筆記(一)矩陣乘法

簡介 本文介紹 cublasSgemm 函式的使用。在 C/C++ 中,通常我們將 2 維矩陣按行儲存為一維陣列。但是在視訊記憶體中,矩陣是按列儲存的。因此,我們在實際使用時,對 cublasSgemm 中的各個引數的賦值可能會搞不清楚。本文,以一個具體的矩陣乘法案例為例子,介紹 cublasSgemm 函式的使用...