// Applies map_kernel to `input` (`size` elements of in_type) in fixed-size
// chunks and gathers the results into a freshly allocated host array `output`
// (`size` elements of out_type, allocated with new[]).
//
// Each chunk holds at most CHUNK_SIZE bytes of in_type. One pair of device
// buffers sized for a full chunk is allocated up front and reused for every
// chunk. Processing stops at the first CUDA error; the error string of the
// final status ("no error" on success) is printed to cout.
const int CHUNK_SIZE = 128 * 1024 * 1024;  // bytes of input staged per chunk
out_type* output = new out_type[size];

// Element counts and offsets are tracked in size_t so that chunk offsets
// cannot overflow int on large inputs.
const size_t total = static_cast<size_t>(size);
int num_items = CHUNK_SIZE / sizeof(in_type);  // elements per full chunk
int chunks = static_cast<int>((total + num_items - 1) / num_items);  // ceil-div

// Launch geometry sized for one full chunk: BLOCK_SIZE x BLOCK_SIZE threads
// per block, a fixed grid width, and enough grid rows to cover num_items.
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
int threadsPerBlockTotal = BLOCK_SIZE * BLOCK_SIZE;
int numBlocksX = 256;
int numBlocksY = (num_items + threadsPerBlockTotal * numBlocksX - 1) /
                 (threadsPerBlockTotal * numBlocksX);
dim3 dimGrid(numBlocksX, numBlocksY);

// Initialised to null so the cudaFree calls below are safe even when an
// allocation fails.
in_type* d_input = NULL;
out_type* d_output = NULL;

cudaError_t error = cudaMalloc(&d_input, num_items * sizeof(in_type));
if (error == cudaSuccess) {
    error = cudaMalloc(&d_output, num_items * sizeof(out_type));
}

for (int ind = 0; error == cudaSuccess && ind < chunks; ind++) {
    const size_t offset = static_cast<size_t>(ind) * num_items;
    const size_t remaining = total - offset;
    // Every chunk is full except possibly the last one. Deriving the count
    // from the remaining element count (rather than size % num_items) keeps
    // the last chunk correct when size is an exact multiple of num_items.
    const int count = (remaining < static_cast<size_t>(num_items))
                          ? static_cast<int>(remaining)
                          : num_items;

    error = cudaMemcpy(d_input, input + offset, count * sizeof(in_type),
                       cudaMemcpyHostToDevice);
    if (error != cudaSuccess) break;

    // NOTE(review): the launch configuration was missing in the original
    // text; dimGrid/dimBlock are the only geometry declared here. Confirm
    // that map_kernel expects a 2D grid of 2D blocks and bounds-checks
    // against `count`, since the grid is sized for a full chunk.
    map_kernel<<<dimGrid, dimBlock>>>(d_input, d_output, count);
    error = cudaGetLastError();  // reports launch-configuration errors
    if (error != cudaSuccess) break;

    error = cudaDeviceSynchronize();  // reports errors raised during execution
    if (error != cudaSuccess) break;

    error = cudaMemcpy(output + offset, d_output, count * sizeof(out_type),
                       cudaMemcpyDeviceToHost);
}

cudaFree(d_input);
cudaFree(d_output);

const char* lerror = cudaGetErrorString(error);
cout << lerror << endl;