Parallelize code that uses structs and pointer-to-pointer in CUDA

Discussion in 'Programming' started by Sakbar1234, Nov 13, 2013.

    I have come up with the implementation below, but I am getting an access violation error at the cublasSdot call. Any suggestions? I think the problem is in the way I linearize the weight matrix. Thanks in advance.

    typedef struct {                  /* A LAYER OF A NET:                     */
            INT     Units;            /* - number of units in this layer       */
            REAL*   Output;           /* - output of ith unit                  */
            REAL*   Error;            /* - error term of ith unit              */
            REAL**  Weight;           /* - connection weights to ith unit      */
            REAL**  WeightSave;       /* - saved weights for stopped training  */
            REAL**  dWeight;          /* - last weight deltas for momentum     */
    } LAYER;

    typedef struct {                  /* A NET:                                */
            LAYER** Layer;            /* - layers of this net                  */
            LAYER*  InputLayer;       /* - input layer                         */
            LAYER*  OutputLayer;      /* - output layer                        */
            REAL    Alpha;            /* - momentum factor                     */
            REAL    Eta;              /* - learning rate                       */
            REAL    Gain;             /* - gain of sigmoid function            */
            REAL    Error;            /* - total net error                     */
    } NET;

    __global__ void calculatelowerError_dev(REAL* lowerError_dev, REAL* lowerOutput_dev, REAL nGain, REAL* Err_dev, INT lowerUnits){
    
      int idx = blockDim.x * blockIdx.x + threadIdx.x;
    
      if(idx < lowerUnits){
        lowerError_dev[idx] = nGain * lowerOutput_dev[idx] * (1 - lowerOutput_dev[idx]) * Err_dev[idx];
      }
    }
    
    void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
    {
      INT  i,j;
      REAL Out;
      REAL* Err;
      REAL nGain = Net->Gain;
      INT lowerUnits = Lower->Units;
      INT upperUnits = Upper->Units;
      REAL* lowerOutput = Lower->Output;
      REAL* upperError = Upper->Error;
      REAL* lowerError = Lower->Error;
      REAL* linearWeight;
    
      REAL* linearWeight_dev;
      REAL* lowerOutput_dev;
      REAL* upperError_dev;
      REAL* lowerError_dev;
      REAL* Err_dev;
    
      linearWeight = (REAL*) calloc(lowerUnits * upperUnits, sizeof(REAL));
      Err = (REAL*) calloc(lowerUnits, sizeof(REAL));
      for (i=1; i<=lowerUnits; i++){
        for (j=1; j<=upperUnits; j++){
          linearWeight[(i*upperUnits)+j] = Upper->Weight[j][i];
        }
      }
      cudaMalloc((void**) &linearWeight_dev, sizeof(REAL) * lowerUnits * upperUnits);
      cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits);
      cudaMalloc((void**) &Err_dev, sizeof(REAL) * lowerUnits);
    
      cudaMemcpy(linearWeight_dev, linearWeight, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
      cudaMemcpy(upperError_dev, upperError, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
    
      for (i=1; i<=lowerUnits;i++){
        cublasSdot(handle, upperUnits, linearWeight+i, 1, upperError_dev, 1, Err+i);
      }
    
      cudaMemcpy(Err_dev, Err, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
    
      cudaMalloc((void**) &lowerError_dev, sizeof(REAL) * lowerUnits);
      threads_per_block = MAX_THREADS_PER_BLOCK;
      nblocks = (lowerUnits + threads_per_block - 1) / threads_per_block;
      calculatelowerError_dev<<< nblocks, threads_per_block>>>(lowerError_dev, lowerOutput_dev, nGain, Err_dev, lowerUnits);
    
      cudaMemcpy(lowerError, lowerError_dev, sizeof(REAL) * lowerUnits, cudaMemcpyDeviceToHost);
    
    }
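    For comparison, here is a minimal, self-contained sketch of what I think the weight flattening and the cublasSdot call are supposed to look like: 0-based, row-major indexing, both vector arguments in device memory, and the result returned to a host variable under the default pointer mode. All names here (weight2d, flat, flat_dev, vec_dev, ...) are placeholders and are not taken from my code above.

    /* Minimal sketch (placeholder names, not the original code):
     * flatten a REAL**-style matrix into a contiguous row-major array
     * and take one dot product on the GPU with cublasSdot. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main(void)
    {
      const int rows = 4, cols = 3;                /* placeholder sizes */

      /* build a small REAL**-style matrix on the host (0-based) */
      float **weight2d = (float**) malloc(rows * sizeof(float*));
      for (int i = 0; i < rows; i++) {
        weight2d[i] = (float*) malloc(cols * sizeof(float));
        for (int j = 0; j < cols; j++)
          weight2d[i][j] = (float)(i * cols + j);
      }

      /* flatten row-major: element (i, j) goes to index i*cols + j */
      float *flat = (float*) malloc(rows * cols * sizeof(float));
      for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
          flat[i * cols + j] = weight2d[i][j];

      float *vec = (float*) malloc(cols * sizeof(float));
      for (int j = 0; j < cols; j++)
        vec[j] = 1.0f;

      /* copy both operands to the device: cublasSdot expects device
         pointers for its two vector arguments */
      float *flat_dev, *vec_dev;
      cudaMalloc((void**) &flat_dev, rows * cols * sizeof(float));
      cudaMalloc((void**) &vec_dev,  cols * sizeof(float));
      cudaMemcpy(flat_dev, flat, rows * cols * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(vec_dev,  vec,  cols * sizeof(float),        cudaMemcpyHostToDevice);

      cublasHandle_t handle;
      cublasCreate(&handle);

      /* dot product of row r of the flattened matrix with vec; with the
         default pointer mode the result lands in a host variable */
      int   r      = 2;
      float result = 0.0f;
      cublasSdot(handle, cols, flat_dev + r * cols, 1, vec_dev, 1, &result);
      printf("row %d . vec = %f\n", r, result);

      cublasDestroy(handle);
      cudaFree(flat_dev);
      cudaFree(vec_dev);
      for (int i = 0; i < rows; i++)
        free(weight2d[i]);
      free(weight2d);
      free(flat);
      free(vec);
      return 0;
    }

    If I understand the cuBLAS v2 API correctly, cublasSdot expects device pointers for its two vector arguments and, with the default CUBLAS_POINTER_MODE_HOST, a host pointer for the result; the sketch above should build with something like nvcc sketch.cu -lcublas.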

     