I have come up with the implementation below, but I am getting an access violation error at cublasSdot. Any suggestions? I think the problem is in the way I linearize the weight matrix. Thanks in advance.

Code (markup):

typedef struct {                     /* A LAYER OF A NET:                    */
   INT    Units;                     /* - number of units in this layer      */
   REAL*  Output;                    /* - output of ith unit                 */
   REAL*  Error;                     /* - error term of ith unit             */
   REAL** Weight;                    /* - connection weights to ith unit     */
   REAL** WeightSave;                /* - saved weights for stopped training */
   REAL** dWeight;                   /* - last weight deltas for momentum    */
} LAYER;

typedef struct {                     /* A NET:                               */
   LAYER** Layer;                    /* - layers of this net                 */
   LAYER*  InputLayer;               /* - input layer                        */
   LAYER*  OutputLayer;              /* - output layer                       */
   REAL    Alpha;                    /* - momentum factor                    */
   REAL    Eta;                      /* - learning rate                      */
   REAL    Gain;                     /* - gain of sigmoid function           */
   REAL    Error;                    /* - total net error                    */
} NET;

__global__ void calculatelowerError_dev(REAL* lowerError_dev, REAL* lowerOutput_dev,
                                        REAL nGain, REAL* Err_dev, INT lowerUnits)
{
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx < lowerUnits) {
      /* sigmoid derivative times the accumulated error term */
      lowerError_dev[idx] = nGain * lowerOutput_dev[idx] * (1 - lowerOutput_dev[idx]) * Err_dev[idx];
   }
}

void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
{
   INT   i, j;
   REAL  Out;
   REAL* Err;
   REAL  nGain       = Net->Gain;
   INT   lowerUnits  = Lower->Units;
   INT   upperUnits  = Upper->Units;
   REAL* lowerOutput = Lower->Output;
   REAL* upperError  = Upper->Error;
   REAL* lowerError  = Lower->Error;
   REAL* linearWeight;
   REAL* linearWeight_dev;
   REAL* lowerOutput_dev;
   REAL* upperError_dev;
   REAL* lowerError_dev;
   REAL* Err_dev;

   linearWeight = (REAL*) calloc(lowerUnits * upperUnits, sizeof(REAL));
   Err          = (REAL*) calloc(lowerUnits, sizeof(REAL));

   /* flatten Upper->Weight so that row i holds the weights from lower unit i
      to every upper unit */
   for (i = 1; i <= lowerUnits; i++) {
      for (j = 1; j <= upperUnits; j++) {
         linearWeight[(i * upperUnits) + j] = Upper->Weight[j][i];
      }
   }

   cudaMalloc((void**) &linearWeight_dev, sizeof(REAL) * lowerUnits * upperUnits);
   cudaMalloc((void**) &upperError_dev,   sizeof(REAL) * upperUnits);
   cudaMalloc((void**) &Err_dev,          sizeof(REAL) * lowerUnits);

   cudaMemcpy(linearWeight_dev, linearWeight, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
   cudaMemcpy(upperError_dev,   upperError,   sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

   /* dot product of each weight row with the upper layer's error terms */
   for (i = 1; i <= lowerUnits; i++) {
      cublasSdot(handle, upperUnits, linearWeight + i, 1, upperError_dev, 1, Err + i);
   }

   cudaMemcpy(Err_dev, Err, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

   cudaMalloc((void**) &lowerError_dev, sizeof(REAL) * lowerUnits);

   threads_per_block = MAX_THREADS_PER_BLOCK;
   nblocks = (lowerUnits + threads_per_block - 1) / threads_per_block;
   calculatelowerError_dev<<< nblocks, threads_per_block >>>(lowerError_dev, lowerOutput_dev, nGain, Err_dev, lowerUnits);

   cudaMemcpy(lowerError, lowerError_dev, sizeof(REAL) * lowerUnits, cudaMemcpyDeviceToHost);
}
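For reference, here is a minimal standalone sketch of how I understand cublasSdot is supposed to be called; this is just my reading of the documentation (both input vectors passed as device pointers, and with the default CUBLAS_POINTER_MODE_HOST the scalar result written to a host variable), and the names dx, dy, n are only for illustration, not part of my real code. If this understanding is right, then the problem is probably in my flattening/indexing in BackpropagateLayer rather than in the cuBLAS call itself.

Code (markup):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
   const int n = 4;
   float hx[] = { 1.0f, 2.0f, 3.0f, 4.0f };
   float hy[] = { 1.0f, 1.0f, 1.0f, 1.0f };
   float *dx, *dy;
   float result = 0.0f;

   /* copy both vectors to the device first */
   cudaMalloc((void**) &dx, n * sizeof(float));
   cudaMalloc((void**) &dy, n * sizeof(float));
   cudaMemcpy(dx, hx, n * sizeof(float), cudaMemcpyHostToDevice);
   cudaMemcpy(dy, hy, n * sizeof(float), cudaMemcpyHostToDevice);

   cublasHandle_t handle;
   cublasCreate(&handle);

   /* x and y must both be device pointers; with the default pointer mode
      (CUBLAS_POINTER_MODE_HOST) the result is written to a host address */
   cublasSdot(handle, n, dx, 1, dy, 1, &result);

   printf("dot = %f\n", result);   /* expect 10.0 */

   cublasDestroy(handle);
   cudaFree(dx);
   cudaFree(dy);
   return 0;
}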