Strumenti Utente

Strumenti Sito


magistraleinformaticanetworking:spm:skepu-cuda-map

Sample map in CUDA

This is just a code excerpt of a single map operation implemented directly using CUDA.

mapcuda.cpp
          // Chunked map: the input is streamed through a pair of fixed-size device
          // buffers. For every chunk: copy host -> device, run map_kernel on it,
          // copy the result device -> host into `output`.
          // `input`, `size`, `in_type`, `out_type`, `BLOCK_SIZE` and `map_kernel`
          // are defined outside this excerpt.

          // Per-chunk staging budget in bytes (128 * 1024 * 1024).
          const int CHUNK_SIZE = 128 * 1024 * 1024;
          out_type* output = new out_type[size];
          // Number of elements per chunk, derived from the input element size.
          int num_items = CHUNK_SIZE / sizeof(in_type);
          // Ceil-div: the last chunk may be only partially filled.
          int chunks = (size + num_items - 1) / num_items;
          dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
          int threadsPerBlockTotal = BLOCK_SIZE * BLOCK_SIZE;
          int numBlocksX = 256;
          // Enough rows of blocks so that numBlocksX * numBlocksY blocks provide
          // at least one thread per element of a full chunk (ceil-div).
          int numBlocksY = (num_items + threadsPerBlockTotal * numBlocksX - 1) / (threadsPerBlockTotal * numBlocksX);
          dim3 dimGrid(numBlocksX, numBlocksY);
          // Initialised to null so the cudaFree calls below are safe even when
          // an allocation failed.
          in_type *d_input = 0;
          out_type *d_output = 0;
          // `error` holds the first failure encountered; once it is set, no
          // further CUDA work is issued.
          cudaError_t error = cudaMalloc(&d_input, num_items * sizeof(in_type));
          if(error == cudaSuccess)
             error = cudaMalloc(&d_output, num_items * sizeof(out_type));
          for(int ind = 0; ind < chunks && error == cudaSuccess; ind++) {
             int offset = num_items * ind;
             // Every chunk is full except possibly the last one. Computing the
             // tail as size - offset (rather than size % num_items) yields
             // num_items, not 0, when size is an exact multiple of num_items.
             int count = num_items;
             if(ind == chunks - 1)
                count = size - offset;
             error = cudaMemcpy(d_input, input + offset,
                                count * sizeof(in_type),
                                cudaMemcpyHostToDevice);
             if(error != cudaSuccess)
                break;
             map_kernel<<<dimGrid, dimBlock>>>(d_input, d_output, count);
             // A kernel launch returns nothing: launch-configuration errors are
             // reported by cudaGetLastError, execution errors by the sync.
             error = cudaGetLastError();
             if(error == cudaSuccess)
                error = cudaDeviceSynchronize();
             if(error != cudaSuccess)
                break;
             error = cudaMemcpy(output + offset, d_output,
                                count * sizeof(out_type),
                                cudaMemcpyDeviceToHost);
          }
          cudaFree(d_input); cudaFree(d_output);
          // If nothing failed above, still pick up any pending error state.
          if(error == cudaSuccess)
             error = cudaGetLastError();
          const char* lerror = cudaGetErrorString(error);
          cout << lerror << endl;
 
magistraleinformaticanetworking/spm/skepu-cuda-map.txt · Ultima modifica: 28/10/2013 alle 17:45 (11 anni fa) da Marco Danelutto