Hi
How does the kernel function know which block to work with when looping? I have looked over the examples but could not identify one that resembles this case. Is there any chance you could give me a small example to solve this? Ignoring the leftovers at the moment, I have done this:
// Host side: flatten the raster window, upload it together with a
// precomputed squared-distance table, run the neighborhood kernel once over
// the whole raster and download the reclassified result.
int firstRow = raster.StartRow;
int lastRow = raster.EndRow;
int firstCol = raster.StartColumn;
int lastCol = raster.EndColumn;
int nbRows = lastRow - firstRow + 1;
int nbCols = lastCol - firstCol + 1;
int thresHold = 50;

// Row-major copy of the window [firstRow..lastRow] x [firstCol..lastCol];
// cells that are not strictly positive are stored as -1 so the kernel skips them.
var cpuMemIn = new float[nbRows * nbCols];
int i2 = 0;
for (var i = firstRow; i <= lastRow; i++)
{
    for (var j = firstCol; j <= lastCol; j++)
    {
        cpuMemIn[i2++] = raster.Value[i, j] > 0 ? (float) raster.Value[i, j] : -1;
    }
}

// Precalculate relative distances to spare workload for gpu
int radiusNbCells = 15; // Number of cells that corresponds to the radius of the neighborhood circle
var windowWidth = radiusNbCells * 2 + 1;
var cpuDistInSqr = new int[windowWidth * windowWidth]; // Squared distance from the window center, row-major
var distIndex = 0;
for (int i = -radiusNbCells; i <= radiusNbCells; i++)
{
    for (int j = -radiusNbCells; j <= radiusNbCells; j++)
    {
        cpuDistInSqr[distIndex++] = i * i + j * j;
    }
}

// The kernel indexes the flattened buffer from 0, so it needs the last
// zero-based row/column of the window, not the raster's absolute indices
// (the two only coincide when StartRow and StartColumn are 0).
var constValue = new int[4];
constValue[0] = nbRows - 1;
constValue[1] = nbCols - 1;
constValue[2] = thresHold;
constValue[3] = radiusNbCells;

// Allocate(hostArray) only reserves device memory of matching size; inputs
// must be uploaded with CopyToDevice, which allocates and copies in one call.
var gpuMemIn = gpu.CopyToDevice(cpuMemIn);
var gpuMemDistIn = gpu.CopyToDevice(cpuDistInSqr);
var gpuConstValue = gpu.CopyToDevice(constValue);
var resultArray = new float[nbRows * nbCols];
var gpuMemOut = gpu.Allocate(resultArray); // output only, no upload needed

// One 2-D launch covers the whole raster: every thread handles one cell and
// finds it from blockIdx/threadIdx, so no host-side loop over blocks is
// needed. The grid is rounded up, so the kernel must ignore threads that
// fall past the last row/column.
const int blockEdge = 16; // 16 x 16 = 256 threads per block
var blockSize = new dim3(blockEdge, blockEdge);
var gridSize = new dim3((nbCols + blockEdge - 1) / blockEdge, (nbRows + blockEdge - 1) / blockEdge);
try
{
    // The kernel's loopCounter argument is unused with a single launch; pass 0.
    gpu.Launch(gridSize, blockSize, NeighborhoodKernel, 0, gpuConstValue, gpuMemIn, gpuMemDistIn, gpuMemOut);
    gpu.Synchronize(); // wait for the kernel before reading the result back
    gpu.CopyFromDevice(gpuMemOut, resultArray);
}
finally
{
    // Release the device buffers even when the launch or the copy throws.
    gpu.Free(gpuMemIn);
    gpu.Free(gpuMemDistIn);
    gpu.Free(gpuConstValue);
    gpu.Free(gpuMemOut);
}
/// <summary>
/// Kernel entry point: each thread computes the neighborhood value of one
/// raster cell and stores it in <paramref name="dataOut"/>.
/// </summary>
/// <param name="thread">Thread context supplying block and thread indices.</param>
/// <param name="loopCounter">Not used by the kernel; kept so existing callers still compile.</param>
/// <param name="constValues">[0] last row index, [1] last column index, [2] threshold, [3] radius in cells.</param>
/// <param name="dataIn">Row-major input with row width <c>constValues[1] + 1</c>.</param>
/// <param name="distInSqr">Row-major table of squared distances for the (2 * radius + 1)^2 window.</param>
/// <param name="dataOut">Row-major output, same layout as <paramref name="dataIn"/>.</param>
[Cudafy]
public static void NeighborhoodKernel(GThread thread, int loopCounter, int[] constValues, float[] dataIn, int[] distInSqr, float[] dataOut)
{
    // Global 2-D position of this thread: x runs along columns, y along rows.
    int x = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    int y = thread.threadIdx.y + thread.blockIdx.y * thread.blockDim.y;
    int lastRow = constValues[0];
    int lastCol = constValues[1];
    int thresHold = constValues[2];
    int radiusNbCells = constValues[3];

    // No loop is needed: one thread handles exactly one cell. Threads whose
    // position lies outside the raster (the grid may be larger than the data)
    // must not read or write anything.
    if (x <= lastCol && y <= lastRow)
    {
        // Same row-major layout that GetNeighborhoodValue uses for dataIn.
        dataOut[y * (lastCol + 1) + x] = GetNeighborhoodValue(y, x, radiusNbCells, distInSqr, lastRow, lastCol, dataIn, thresHold);
    }
}
/// <summary>
/// Sums the strictly positive cells of <paramref name="dataIn"/> that lie inside
/// the raster and within the circular neighborhood around cell
/// (<paramref name="i"/>, <paramref name="j"/>), then reclassifies the sum
/// against <paramref name="thresHold"/>.
/// </summary>
/// <param name="i">Row index of the center cell.</param>
/// <param name="j">Column index of the center cell.</param>
/// <param name="radiusNbCells">Neighborhood radius in cells.</param>
/// <param name="distInSqr">Row-major table of squared distances for the (2 * radius + 1)^2 window.</param>
/// <param name="lastRow">Last valid row index.</param>
/// <param name="lastCol">Last valid column index; the row width is <c>lastCol + 1</c>.</param>
/// <param name="dataIn">Row-major raster values; values that are not &gt; 0 are ignored.</param>
/// <param name="thresHold">Minimum sum that yields 1.</param>
/// <returns>1.0f when the neighborhood sum reaches the threshold, otherwise 0.0f.</returns>
[Cudafy]
public static float GetNeighborhoodValue(int i, int j, int radiusNbCells, int[] distInSqr, int lastRow, int lastCol,
    float[] dataIn, int thresHold)
{
    int windowWidth = 2 * radiusNbCells + 1;
    int radiusSqr = radiusNbCells * radiusNbCells;
    int rasterWidth = lastCol + 1;

    // Top-left corner of the (unclipped) window in raster coordinates.
    int windowTop = i - radiusNbCells;
    int windowLeft = j - radiusNbCells;

    // Clip the window to the raster once instead of testing every cell.
    int rowBegin = windowTop;
    if (rowBegin < 0)
    {
        rowBegin = 0;
    }
    int rowEnd = i + radiusNbCells;
    if (rowEnd > lastRow)
    {
        rowEnd = lastRow;
    }
    int colBegin = windowLeft;
    if (colBegin < 0)
    {
        colBegin = 0;
    }
    int colEnd = j + radiusNbCells;
    if (colEnd > lastCol)
    {
        colEnd = lastCol;
    }

    float sum = 0;
    for (int r = rowBegin; r <= rowEnd; r++)
    {
        // Offsets of the first cell of this row in the distance table and in the raster.
        int distRowStart = (r - windowTop) * windowWidth;
        int rasterRowStart = r * rasterWidth;
        for (int c = colBegin; c <= colEnd; c++)
        {
            // Skip window cells that fall outside the circle.
            if (distInSqr[distRowStart + (c - windowLeft)] > radiusSqr)
            {
                continue;
            }

            float cell = dataIn[rasterRowStart + c];
            if (cell > 0.0f)
            {
                sum += cell;
            }
        }
    }

    // Reclassify
    if (sum >= thresHold)
    {
        return 1.0f;
    }
    return 0.0f;
}