|
tesseract 3.04.01
|
00001 #ifdef _WIN32 00002 #include <windows.h> 00003 #include <io.h> 00004 00005 #else 00006 #include <sys/types.h> 00007 #include <unistd.h> 00008 #endif 00009 #include <float.h> 00010 00011 #include "openclwrapper.h" 00012 #include "oclkernels.h" 00013 00014 // for micro-benchmark 00015 #include "otsuthr.h" 00016 #include "thresholder.h" 00017 00018 #if ON_APPLE 00019 #include <stdio.h> 00020 #include <mach/mach_time.h> 00021 #endif 00022 00023 /* 00024 Convenience macro to test the version of Leptonica. 00025 */ 00026 #if defined(LIBLEPT_MAJOR_VERSION) && defined(LIBLEPT_MINOR_VERSION) 00027 # define TESSERACT_LIBLEPT_PREREQ(maj, min) \ 00028 ((LIBLEPT_MAJOR_VERSION) > (maj) || ((LIBLEPT_MAJOR_VERSION) == (maj) && (LIBLEPT_MINOR_VERSION) >= (min))) 00029 #else 00030 # define TESSERACT_LIBLEPT_PREREQ(maj, min) 0 00031 #endif 00032 00033 #if TESSERACT_LIBLEPT_PREREQ(1,73) 00034 # define CALLOC LEPT_CALLOC 00035 # define FREE LEPT_FREE 00036 #endif 00037 00038 #ifdef USE_OPENCL 00039 00040 #include "opencl_device_selection.h" 00041 GPUEnv OpenclDevice::gpuEnv; 00042 00043 00044 bool OpenclDevice::deviceIsSelected = false; 00045 ds_device OpenclDevice::selectedDevice; 00046 00047 00048 int OpenclDevice::isInited = 0; 00049 00050 struct tiff_transform { 00051 int vflip; /* if non-zero, image needs a vertical fip */ 00052 int hflip; /* if non-zero, image needs a horizontal flip */ 00053 int rotate; /* -1 -> counterclockwise 90-degree rotation, 00054 0 -> no rotation 00055 1 -> clockwise 90-degree rotation */ 00056 }; 00057 00058 static struct tiff_transform tiff_orientation_transforms[] = { 00059 {0, 0, 0}, 00060 {0, 1, 0}, 00061 {1, 1, 0}, 00062 {1, 0, 0}, 00063 {0, 1, -1}, 00064 {0, 0, 1}, 00065 {0, 1, 1}, 00066 {0, 0, -1} 00067 }; 00068 00069 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000; 00070 00071 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers 00072 cl_mem pixThBuffer; //output from thresholdtopix calculation 00073 cl_int clStatus; 00074 KernelEnv rEnv; 00075 00076 // substitute invalid characters in device name with _ 00077 void legalizeFileName( char *fileName) { 00078 //printf("fileName: %s\n", fileName); 00079 const char* invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches 00080 // for each invalid char 00081 for (int i = 0; i < strlen(invalidChars); i++) { 00082 char invalidStr[4]; 00083 invalidStr[0] = invalidChars[i]; 00084 invalidStr[1] = NULL; 00085 //printf("eliminating %s\n", invalidStr); 00086 //char *pos = strstr(fileName, invalidStr); 00087 // initial ./ is valid for present directory 00088 //if (*pos == '.') pos++; 00089 //if (*pos == '/') pos++; 00090 for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) { 00091 //printf("\tfound: %s, ", pos); 00092 pos[0] = '_'; 00093 //printf("fileName: %s\n", fileName); 00094 } 00095 } 00096 } 00097 00098 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) { 00099 //printf("[DS] populateGPUEnvFromDevice\n"); 00100 size_t size; 00101 gpuInfo->mnIsUserCreated = 1; 00102 // device 00103 gpuInfo->mpDevID = device; 00104 gpuInfo->mpArryDevsID = new cl_device_id[1]; 00105 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; 00106 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) &gpuInfo->mDevType , &size); 00107 CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); 00108 // platform 00109 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size); 00110 CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); 00111 // context 00112 cl_context_properties props[3]; 00113 props[0] = CL_CONTEXT_PLATFORM; 00114 props[1] = (cl_context_properties) gpuInfo->mpPlatformID; 00115 props[2] = 0; 00116 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus); 00117 CHECK_OPENCL( clStatus, "populateGPUEnv::createContext"); 00118 // queue 00119 cl_command_queue_properties queueProperties = 0; 00120 gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus ); 00121 CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue"); 00122 00123 } 00124 00125 int OpenclDevice::LoadOpencl() 00126 { 00127 #ifdef WIN32 00128 HINSTANCE HOpenclDll = NULL; 00129 void * OpenclDll = NULL; 00130 //fprintf(stderr, " LoadOpenclDllxx... \n"); 00131 OpenclDll = static_cast<HINSTANCE>( HOpenclDll ); 00132 OpenclDll = LoadLibrary( "openCL.dll" ); 00133 if ( !static_cast<HINSTANCE>( OpenclDll ) ) 00134 { 00135 fprintf(stderr, "[OD] Load opencl.dll failed!\n"); 00136 FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) ); 00137 return 0; 00138 00139 } 00140 fprintf(stderr, "[OD] Load opencl.dll successful!\n"); 00141 #endif 00142 return 1; 00143 } 00144 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) 00145 { 00146 envInfo->mpkContext = gpuEnv.mpContext; 00147 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; 00148 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0]; 00149 00150 return 1; 00151 } 00152 00153 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus) 00154 { 00155 cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags), 00156 nElements * sizeof(l_uint32), hostbuffer, pStatus); 00157 00158 return membuffer; 00159 } 00160 00161 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true) 00162 { 00163 PROCNAME("mapOutputCLBuffer"); 00164 if (!pixd) 00165 { 00166 if (memcopy) 00167 { 00168 if ((pixd = pixCreateTemplate(pixs)) == NULL) 00169 (PIX *)ERROR_PTR("pixd not made", procName, NULL); 00170 } 00171 else 00172 { 00173 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL) 00174 (PIX *)ERROR_PTR("pixd not made", procName, NULL); 00175 } 00176 } 00177 l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, 00178 elements * sizeof(l_uint32), 0, NULL, NULL, NULL ); 00179 00180 if (memcopy) 00181 { 00182 memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32)); 00183 } 00184 else 00185 { 00186 pixSetData(pixd, pValues); 00187 } 00188 00189 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL); 00190 00191 if (sync) 00192 { 00193 clFinish( rEnv.mpkCmdQueue ); 00194 } 00195 00196 return pixd; 00197 } 00198 00199 cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false) 00200 { 00201 cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), 00202 nElements * sizeof(l_int32), NULL, pStatus); 00203 00204 if (_pValues != NULL) 00205 { 00206 l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, 00207 nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); 00208 00209 memcpy(pValues, _pValues, nElements * sizeof(l_int32)); 00210 00211 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); 00212 00213 if (sync) 00214 clFinish( rEnv.mpkCmdQueue ); 00215 } 00216 00217 return xValues; 00218 } 00219 00220 00221 void OpenclDevice::releaseMorphCLBuffers() 00222 { 00223 if (pixdCLIntermediate != NULL) 00224 clReleaseMemObject(pixdCLIntermediate); 00225 if (pixsCLBuffer != NULL) 00226 clReleaseMemObject(pixsCLBuffer); 00227 if (pixdCLBuffer != NULL) 00228 clReleaseMemObject(pixdCLBuffer); 00229 if (pixThBuffer != NULL) 00230 clReleaseMemObject(pixThBuffer); 00231 } 00232 00233 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs) 00234 { 00235 SetKernelEnv( &rEnv ); 00236 00237 if (pixThBuffer != NULL) 00238 { 00239 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00240 00241 //Get the output from ThresholdToPix operation 00242 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL); 00243 } 00244 else 00245 { 00246 //Get data from the source image 00247 l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32)); 00248 memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32)); 00249 00250 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus); 00251 } 00252 00253 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00254 00255 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00256 00257 return (int)clStatus; 00258 } 00259 00260 int OpenclDevice::InitEnv() 00261 { 00262 //PERF_COUNT_START("OD::InitEnv") 00263 // printf("[OD] OpenclDevice::InitEnv()\n"); 00264 #ifdef SAL_WIN32 00265 while( 1 ) 00266 { 00267 if( 1 == LoadOpencl() ) 00268 break; 00269 } 00270 PERF_COUNT_SUB("LoadOpencl") 00271 #endif 00272 // sets up environment, compiles programs 00273 00274 00275 InitOpenclRunEnv_DeviceSelection( 0 ); 00276 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS") 00277 //PERF_COUNT_END 00278 return 1; 00279 } 00280 00281 int OpenclDevice::ReleaseOpenclRunEnv() 00282 { 00283 ReleaseOpenclEnv( &gpuEnv ); 00284 #ifdef SAL_WIN32 00285 FreeOpenclDll(); 00286 #endif 00287 return 1; 00288 } 00289 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName ) 00290 { 00291 if ( kCount < 1 ) 00292 fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" ); 00293 strcpy( gpuEnv.mArrykernelNames[kCount-1], kName ); 00294 gpuEnv.mnKernelCount++; 00295 return 0; 00296 } 00297 int OpenclDevice::RegistOpenclKernel() 00298 { 00299 if ( !gpuEnv.mnIsUserCreated ) 00300 memset( &gpuEnv, 0, sizeof(gpuEnv) ); 00301 00302 gpuEnv.mnFileCount = 0; //argc; 00303 gpuEnv.mnKernelCount = 0UL; 00304 00305 AddKernelConfig( 1, (const char*) "oclAverageSub1" ); 00306 return 0; 00307 } 00308 00309 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) { 00310 //PERF_COUNT_START("InitOpenclRunEnv_DS") 00311 if (!isInited) { 00312 // after programs compiled, selects best device 00313 //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n"); 00314 ds_device bestDevice_DS = getDeviceSelection( ); 00315 //PERF_COUNT_SUB("called getDeviceSelection()") 00316 cl_device_id bestDevice = bestDevice_DS.oclDeviceID; 00317 // overwrite global static GPUEnv with new device 00318 if (selectedDeviceIsOpenCL() ) { 00319 //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n"); 00320 populateGPUEnvFromDevice( &gpuEnv, bestDevice ); 00321 gpuEnv.mnFileCount = 0; //argc; 00322 gpuEnv.mnKernelCount = 0UL; 00323 //PERF_COUNT_SUB("populate gpuEnv") 00324 CompileKernelFile(&gpuEnv, ""); 00325 //PERF_COUNT_SUB("CompileKernelFile") 00326 } else { 00327 //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n"); 00328 } 00329 isInited = 1; 00330 } 00331 //PERF_COUNT_END 00332 return 0; 00333 } 00334 00335 00336 OpenclDevice::OpenclDevice() 00337 { 00338 //InitEnv(); 00339 } 00340 00341 OpenclDevice::~OpenclDevice() 00342 { 00343 //ReleaseOpenclRunEnv(); 00344 } 00345 00346 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) 00347 { 00348 int i = 0; 00349 int clStatus = 0; 00350 00351 if ( !isInited ) 00352 { 00353 return 1; 00354 } 00355 00356 for ( i = 0; i < gpuEnv.mnFileCount; i++ ) 00357 { 00358 if ( gpuEnv.mpArryPrograms[i] ) 00359 { 00360 clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] ); 00361 CHECK_OPENCL( clStatus, "clReleaseProgram" ); 00362 gpuEnv.mpArryPrograms[i] = NULL; 00363 } 00364 } 00365 if ( gpuEnv.mpCmdQueue ) 00366 { 00367 clReleaseCommandQueue( gpuEnv.mpCmdQueue ); 00368 gpuEnv.mpCmdQueue = NULL; 00369 } 00370 if ( gpuEnv.mpContext ) 00371 { 00372 clReleaseContext( gpuEnv.mpContext ); 00373 gpuEnv.mpContext = NULL; 00374 } 00375 isInited = 0; 00376 gpuInfo->mnIsUserCreated = 0; 00377 free( gpuInfo->mpArryDevsID ); 00378 return 1; 00379 } 00380 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) 00381 { 00382 unsigned int i = 0; 00383 cl_int clStatus; 00384 int status = 0; 00385 char *str = NULL; 00386 FILE *fd = NULL; 00387 char fileName[256] = { 0 }, cl_name[128] = { 0 }; 00388 char deviceName[1024]; 00389 clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL ); 00390 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00391 str = (char*) strstr( clFileName, (char*) ".cl" ); 00392 memcpy( cl_name, clFileName, str - clFileName ); 00393 cl_name[str - clFileName] = '\0'; 00394 sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); 00395 legalizeFileName(fileName); 00396 fd = fopen( fileName, "rb" ); 00397 status = ( fd != NULL ) ? 1 : 0; 00398 if ( fd != NULL ) 00399 { 00400 *fhandle = fd; 00401 } 00402 return status; 00403 00404 } 00405 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) 00406 { 00407 int i; 00408 for ( i = 0; i < gpuEnvCached->mnFileCount; i++ ) 00409 { 00410 if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 ) 00411 { 00412 if ( gpuEnvCached->mpArryPrograms[i] != NULL ) 00413 { 00414 return 1; 00415 } 00416 } 00417 } 00418 00419 return 0; 00420 } 00421 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) 00422 { 00423 FILE *output = NULL; 00424 output = fopen( fileName, "wb" ); 00425 if ( output == NULL ) 00426 { 00427 return 0; 00428 } 00429 00430 fwrite( birary, sizeof(char), numBytes, output ); 00431 fclose( output ); 00432 00433 return 1; 00434 00435 } 00436 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName ) 00437 { 00438 unsigned int i = 0; 00439 cl_int clStatus; 00440 size_t *binarySizes, numDevices=0; 00441 cl_device_id *mpArryDevsID; 00442 char **binaries, *str = NULL; 00443 00444 clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, 00445 sizeof(numDevices), &numDevices, NULL ); 00446 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00447 00448 mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); 00449 if ( mpArryDevsID == NULL ) 00450 { 00451 return 0; 00452 } 00453 /* grab the handles to all of the devices in the program. */ 00454 clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES, 00455 sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL ); 00456 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00457 00458 /* figure out the sizes of each of the binaries. */ 00459 binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices ); 00460 00461 clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, 00462 sizeof(size_t) * numDevices, binarySizes, NULL ); 00463 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00464 00465 /* copy over all of the generated binaries. */ 00466 binaries = (char**) malloc( sizeof(char *) * numDevices ); 00467 if ( binaries == NULL ) 00468 { 00469 return 0; 00470 } 00471 00472 for ( i = 0; i < numDevices; i++ ) 00473 { 00474 if ( binarySizes[i] != 0 ) 00475 { 00476 binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); 00477 if ( binaries[i] == NULL ) 00478 { 00479 return 0; 00480 } 00481 } 00482 else 00483 { 00484 binaries[i] = NULL; 00485 } 00486 } 00487 00488 clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES, 00489 sizeof(char *) * numDevices, binaries, NULL ); 00490 CHECK_OPENCL(clStatus,"clGetProgramInfo"); 00491 00492 /* dump out each binary into its own separate file. */ 00493 for ( i = 0; i < numDevices; i++ ) 00494 { 00495 char fileName[256] = { 0 }, cl_name[128] = { 0 }; 00496 00497 if ( binarySizes[i] != 0 ) 00498 { 00499 char deviceName[1024]; 00500 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, 00501 sizeof(deviceName), deviceName, NULL); 00502 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00503 00504 str = (char*) strstr( clFileName, (char*) ".cl" ); 00505 memcpy( cl_name, clFileName, str - clFileName ); 00506 cl_name[str - clFileName] = '\0'; 00507 sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); 00508 legalizeFileName(fileName); 00509 if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) 00510 { 00511 printf("[OD] write binary[%s] failed\n", fileName); 00512 return 0; 00513 } //else 00514 printf("[OD] write binary[%s] successfully\n", fileName); 00515 } 00516 } 00517 00518 // Release all resouces and memory 00519 for ( i = 0; i < numDevices; i++ ) 00520 { 00521 if ( binaries[i] != NULL ) 00522 { 00523 free( binaries[i] ); 00524 binaries[i] = NULL; 00525 } 00526 } 00527 00528 if ( binaries != NULL ) 00529 { 00530 free( binaries ); 00531 binaries = NULL; 00532 } 00533 00534 if ( binarySizes != NULL ) 00535 { 00536 free( binarySizes ); 00537 binarySizes = NULL; 00538 } 00539 00540 if ( mpArryDevsID != NULL ) 00541 { 00542 free( mpArryDevsID ); 00543 mpArryDevsID = NULL; 00544 } 00545 return 1; 00546 } 00547 00548 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus ) 00549 { 00550 l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, 00551 nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); 00552 clFinish( rEnv.mpkCmdQueue ); 00553 if (_pValues != NULL) 00554 { 00555 for ( int i = 0; i < (int)nElements; i++ ) 00556 pValues[i] = (l_int32)_pValues[i]; 00557 } 00558 00559 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); 00560 //clFinish( rEnv.mpkCmdQueue ); 00561 return; 00562 } 00563 00564 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) 00565 { 00566 //PERF_COUNT_START("CompileKernelFile") 00567 cl_int clStatus = 0; 00568 size_t length; 00569 char *buildLog = NULL, *binary; 00570 const char *source; 00571 size_t source_size[1]; 00572 int b_error, binary_status, binaryExisted, idx; 00573 size_t numDevices; 00574 cl_device_id *mpArryDevsID; 00575 FILE *fd, *fd1; 00576 const char* filename = "kernel.cl"; 00577 //fprintf(stderr, "[OD] CompileKernelFile ... \n"); 00578 if ( CachedOfKernerPrg(gpuInfo, filename) == 1 ) 00579 { 00580 return 1; 00581 } 00582 00583 idx = gpuInfo->mnFileCount; 00584 00585 source = kernel_src; 00586 00587 source_size[0] = strlen( source ); 00588 binaryExisted = 0; 00589 binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark 00590 //PERF_COUNT_SUB("BinaryGenerated") 00591 if ( binaryExisted == 1 ) 00592 { 00593 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, 00594 sizeof(numDevices), &numDevices, NULL ); 00595 CHECK_OPENCL( clStatus, "clGetContextInfo" ); 00596 00597 mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); 00598 if ( mpArryDevsID == NULL ) 00599 { 00600 return 0; 00601 } 00602 //PERF_COUNT_SUB("get numDevices") 00603 b_error = 0; 00604 length = 0; 00605 b_error |= fseek( fd, 0, SEEK_END ) < 0; 00606 b_error |= ( length = ftell(fd) ) <= 0; 00607 b_error |= fseek( fd, 0, SEEK_SET ) < 0; 00608 if ( b_error ) 00609 { 00610 return 0; 00611 } 00612 00613 binary = (char*) malloc( length + 2 ); 00614 if ( !binary ) 00615 { 00616 return 0; 00617 } 00618 00619 memset( binary, 0, length + 2 ); 00620 b_error |= fread( binary, 1, length, fd ) != length; 00621 00622 00623 fclose( fd ); 00624 //PERF_COUNT_SUB("read file") 00625 fd = NULL; 00626 // grab the handles to all of the devices in the context. 00627 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 00628 sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL ); 00629 CHECK_OPENCL( clStatus, "clGetContextInfo" ); 00630 //PERF_COUNT_SUB("get devices") 00631 //fprintf(stderr, "[OD] Create kernel from binary\n"); 00632 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices, 00633 mpArryDevsID, &length, (const unsigned char**) &binary, 00634 &binary_status, &clStatus ); 00635 CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" ); 00636 //PERF_COUNT_SUB("clCreateProgramWithBinary") 00637 free( binary ); 00638 free( mpArryDevsID ); 00639 mpArryDevsID = NULL; 00640 //PERF_COUNT_SUB("binaryExisted") 00641 } 00642 else 00643 { 00644 // create a CL program using the kernel source 00645 //fprintf(stderr, "[OD] Create kernel from source\n"); 00646 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source, 00647 source_size, &clStatus); 00648 CHECK_OPENCL( clStatus, "clCreateProgramWithSource" ); 00649 //PERF_COUNT_SUB("!binaryExisted") 00650 } 00651 00652 if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL ) 00653 { 00654 return 0; 00655 } 00656 00657 //char options[512]; 00658 // create a cl program executable for all the devices specified 00659 //printf("[OD] BuildProgram.\n"); 00660 PERF_COUNT_START("OD::CompileKernel::clBuildProgram") 00661 if (!gpuInfo->mnIsUserCreated) 00662 { 00663 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, 00664 buildOption, NULL, NULL); 00665 //PERF_COUNT_SUB("clBuildProgram notUserCreated") 00666 } 00667 else 00668 { 00669 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), 00670 buildOption, NULL, NULL); 00671 //PERF_COUNT_SUB("clBuildProgram isUserCreated") 00672 } 00673 PERF_COUNT_END 00674 if ( clStatus != CL_SUCCESS ) 00675 { 00676 printf ("BuildProgram error!\n"); 00677 if ( !gpuInfo->mnIsUserCreated ) 00678 { 00679 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], 00680 CL_PROGRAM_BUILD_LOG, 0, NULL, &length ); 00681 } 00682 else 00683 { 00684 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, 00685 CL_PROGRAM_BUILD_LOG, 0, NULL, &length); 00686 } 00687 if ( clStatus != CL_SUCCESS ) 00688 { 00689 printf("opencl create build log fail\n"); 00690 return 0; 00691 } 00692 buildLog = (char*) malloc( length ); 00693 if ( buildLog == (char*) NULL ) 00694 { 00695 return 0; 00696 } 00697 if ( !gpuInfo->mnIsUserCreated ) 00698 { 00699 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], 00700 CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); 00701 } 00702 else 00703 { 00704 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, 00705 CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); 00706 } 00707 if ( clStatus != CL_SUCCESS ) 00708 { 00709 printf("opencl program build info fail\n"); 00710 return 0; 00711 } 00712 00713 fd1 = fopen( "kernel-build.log", "w+" ); 00714 if ( fd1 != NULL ) 00715 { 00716 fwrite( buildLog, sizeof(char), length, fd1 ); 00717 fclose( fd1 ); 00718 } 00719 00720 free( buildLog ); 00721 //PERF_COUNT_SUB("build error log") 00722 return 0; 00723 } 00724 00725 strcpy( gpuInfo->mArryKnelSrcFile[idx], filename ); 00726 //PERF_COUNT_SUB("strcpy") 00727 if ( binaryExisted == 0 ) { 00728 GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename ); 00729 PERF_COUNT_SUB("GenerateBinFromKernelSource") 00730 } 00731 00732 gpuInfo->mnFileCount += 1; 00733 //PERF_COUNT_END 00734 return 1; 00735 } 00736 00737 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line) 00738 { 00739 PERF_COUNT_START("pixReadFromTiffKernel") 00740 cl_int clStatus; 00741 KernelEnv rEnv; 00742 size_t globalThreads[2]; 00743 size_t localThreads[2]; 00744 int gsize; 00745 cl_mem valuesCl; 00746 cl_mem outputCl; 00747 00748 //global and local work dimensions for Horizontal pass 00749 gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 00750 globalThreads[0] = gsize; 00751 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 00752 globalThreads[1] = gsize; 00753 localThreads[0] = GROUPSIZE_X; 00754 localThreads[1] = GROUPSIZE_Y; 00755 00756 SetKernelEnv( &rEnv ); 00757 00758 l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32)); 00759 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus ); 00760 CHECK_OPENCL( clStatus, "clCreateKernel"); 00761 00762 //Allocate input and output OCL buffers 00763 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); 00764 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); 00765 00766 //Kernel arguments 00767 clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl ); 00768 CHECK_OPENCL( clStatus, "clSetKernelArg"); 00769 clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w ); 00770 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 00771 clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h ); 00772 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 00773 clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl ); 00774 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 00775 clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl ); 00776 CHECK_OPENCL( clStatus, "clSetKernelArg"); 00777 00778 //Kernel enqueue 00779 PERF_COUNT_SUB("before") 00780 clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL ); 00781 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); 00782 00783 /* map results back from gpu */ 00784 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus); 00785 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl"); 00786 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL); 00787 00788 //Sync 00789 clFinish( rEnv.mpkCmdQueue ); 00790 PERF_COUNT_SUB("kernel & map") 00791 PERF_COUNT_END 00792 return pResult; 00793 } 00794 00795 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n ) 00796 { 00797 PERF_COUNT_START("pixReadTiffCL") 00798 FILE *fp; 00799 PIX *pix; 00800 00801 //printf("pixReadTiffCl file"); 00802 PROCNAME("pixReadTiff"); 00803 00804 if (!filename) 00805 return (PIX *)ERROR_PTR("filename not defined", procName, NULL); 00806 00807 if ((fp = fopenReadStream(filename)) == NULL) 00808 return (PIX *)ERROR_PTR("image file not found", procName, NULL); 00809 if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) { 00810 fclose(fp); 00811 return (PIX *)ERROR_PTR("pix not read", procName, NULL); 00812 } 00813 fclose(fp); 00814 PERF_COUNT_END 00815 return pix; 00816 00817 } 00818 TIFF * 00819 OpenclDevice::fopenTiffCl(FILE *fp, 00820 const char *modestring) 00821 { 00822 l_int32 fd; 00823 00824 PROCNAME("fopenTiff"); 00825 00826 if (!fp) 00827 return (TIFF *)ERROR_PTR("stream not opened", procName, NULL); 00828 if (!modestring) 00829 return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL); 00830 00831 if ((fd = fileno(fp)) < 0) 00832 return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL); 00833 lseek(fd, 0, SEEK_SET); 00834 00835 return TIFFFdOpen(fd, "TIFFstream", modestring); 00836 } 00837 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif, 00838 l_int32 *pxres, 00839 l_int32 *pyres) 00840 { 00841 l_uint16 resunit; 00842 l_int32 foundxres, foundyres; 00843 l_float32 fxres, fyres; 00844 00845 PROCNAME("getTiffStreamResolution"); 00846 00847 if (!tif) 00848 return ERROR_INT("tif not opened", procName, 1); 00849 if (!pxres || !pyres) 00850 return ERROR_INT("&xres and &yres not both defined", procName, 1); 00851 *pxres = *pyres = 0; 00852 00853 TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit); 00854 foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres); 00855 foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres); 00856 if (!foundxres && !foundyres) return 1; 00857 if (!foundxres && foundyres) 00858 fxres = fyres; 00859 else if (foundxres && !foundyres) 00860 fyres = fxres; 00861 00862 if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */ 00863 *pxres = (l_int32)(2.54 * fxres + 0.5); 00864 *pyres = (l_int32)(2.54 * fyres + 0.5); 00865 } 00866 else { 00867 *pxres = (l_int32)fxres; 00868 *pyres = (l_int32)fyres; 00869 } 00870 00871 return 0; 00872 } 00873 00874 struct L_Memstream 00875 { 00876 l_uint8 *buffer; /* expands to hold data when written to; */ 00877 /* fixed size when read from. */ 00878 size_t bufsize; /* current size allocated when written to; */ 00879 /* fixed size of input data when read from. */ 00880 size_t offset; /* byte offset from beginning of buffer. */ 00881 size_t hw; /* high-water mark; max bytes in buffer. */ 00882 l_uint8 **poutdata; /* input param for writing; data goes here. */ 00883 size_t *poutsize; /* input param for writing; data size goes here. */ 00884 }; 00885 typedef struct L_Memstream L_MEMSTREAM; 00886 00887 /* These are static functions for memory I/O */ 00888 static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize); 00889 static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata, 00890 size_t *poutsize); 00891 static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length); 00892 static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data, 00893 tsize_t length); 00894 static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence); 00895 static l_int32 tiffCloseCallback(thandle_t handle); 00896 static toff_t tiffSizeCallback(thandle_t handle); 00897 static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length); 00898 static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length); 00899 00900 00901 static L_MEMSTREAM * 00902 memstreamCreateForRead(l_uint8 *indata, 00903 size_t insize) 00904 { 00905 L_MEMSTREAM *mstream; 00906 00907 mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM)); 00908 mstream->buffer = indata; /* handle to input data array */ 00909 mstream->bufsize = insize; /* amount of input data */ 00910 mstream->hw = insize; /* high-water mark fixed at input data size */ 00911 mstream->offset = 0; /* offset always starts at 0 */ 00912 return mstream; 00913 } 00914 00915 00916 static L_MEMSTREAM * 00917 memstreamCreateForWrite(l_uint8 **poutdata, 00918 size_t *poutsize) 00919 { 00920 L_MEMSTREAM *mstream; 00921 00922 mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM)); 00923 mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1); 00924 mstream->bufsize = 8 * 1024; 00925 mstream->poutdata = poutdata; /* used only at end of write */ 00926 mstream->poutsize = poutsize; /* ditto */ 00927 mstream->hw = mstream->offset = 0; 00928 return mstream; 00929 } 00930 00931 00932 static tsize_t 00933 tiffReadCallback(thandle_t handle, 00934 tdata_t data, 00935 tsize_t length) 00936 { 00937 L_MEMSTREAM *mstream; 00938 size_t amount; 00939 00940 mstream = (L_MEMSTREAM *)handle; 00941 amount = L_MIN((size_t)length, mstream->hw - mstream->offset); 00942 memcpy(data, mstream->buffer + mstream->offset, amount); 00943 mstream->offset += amount; 00944 return amount; 00945 } 00946 00947 00948 static tsize_t 00949 tiffWriteCallback(thandle_t handle, 00950 tdata_t data, 00951 tsize_t length) 00952 { 00953 L_MEMSTREAM *mstream; 00954 size_t newsize; 00955 00956 /* reallocNew() uses calloc to initialize the array. 00957 * If malloc is used instead, for some of the encoding methods, 00958 * not all the data in 'bufsize' bytes in the buffer will 00959 * have been initialized by the end of the compression. */ 00960 mstream = (L_MEMSTREAM *)handle; 00961 if (mstream->offset + length > mstream->bufsize) { 00962 newsize = 2 * (mstream->offset + length); 00963 mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer, 00964 mstream->offset, newsize); 00965 mstream->bufsize = newsize; 00966 } 00967 00968 memcpy(mstream->buffer + mstream->offset, data, length); 00969 mstream->offset += length; 00970 mstream->hw = L_MAX(mstream->offset, mstream->hw); 00971 return length; 00972 } 00973 00974 00975 static toff_t 00976 tiffSeekCallback(thandle_t handle, 00977 toff_t offset, 00978 l_int32 whence) 00979 { 00980 L_MEMSTREAM *mstream; 00981 00982 PROCNAME("tiffSeekCallback"); 00983 mstream = (L_MEMSTREAM *)handle; 00984 switch (whence) { 00985 case SEEK_SET: 00986 /* fprintf(stderr, "seek_set: offset = %d\n", offset); */ 00987 mstream->offset = offset; 00988 break; 00989 case SEEK_CUR: 00990 /* fprintf(stderr, "seek_cur: offset = %d\n", offset); */ 00991 mstream->offset += offset; 00992 break; 00993 case SEEK_END: 00994 /* fprintf(stderr, "seek end: hw = %d, offset = %d\n", 00995 mstream->hw, offset); */ 00996 mstream->offset = mstream->hw - offset; /* offset >= 0 */ 00997 break; 00998 default: 00999 return (toff_t)ERROR_INT("bad whence value", procName, 01000 mstream->offset); 01001 } 01002 01003 return mstream->offset; 01004 } 01005 01006 01007 static l_int32 01008 tiffCloseCallback(thandle_t handle) 01009 { 01010 L_MEMSTREAM *mstream; 01011 01012 mstream = (L_MEMSTREAM *)handle; 01013 if (mstream->poutdata) { /* writing: save the output data */ 01014 *mstream->poutdata = mstream->buffer; 01015 *mstream->poutsize = mstream->hw; 01016 } 01017 FREE(mstream); /* never free the buffer! */ 01018 return 0; 01019 } 01020 01021 01022 static toff_t 01023 tiffSizeCallback(thandle_t handle) 01024 { 01025 L_MEMSTREAM *mstream; 01026 01027 mstream = (L_MEMSTREAM *)handle; 01028 return mstream->hw; 01029 } 01030 01031 01032 static l_int32 01033 tiffMapCallback(thandle_t handle, 01034 tdata_t *data, 01035 toff_t *length) 01036 { 01037 L_MEMSTREAM *mstream; 01038 01039 mstream = (L_MEMSTREAM *)handle; 01040 *data = mstream->buffer; 01041 *length = mstream->hw; 01042 return 0; 01043 } 01044 01045 01046 static void 01047 tiffUnmapCallback(thandle_t handle, 01048 tdata_t data, 01049 toff_t length) 01050 { 01051 return; 01052 } 01053 01054 01071 static TIFF * 01072 fopenTiffMemstream(const char *filename, 01073 const char *operation, 01074 l_uint8 **pdata, 01075 size_t *pdatasize) 01076 { 01077 L_MEMSTREAM *mstream; 01078 01079 PROCNAME("fopenTiffMemstream"); 01080 01081 if (!filename) 01082 return (TIFF *)ERROR_PTR("filename not defined", procName, NULL); 01083 if (!operation) 01084 return (TIFF *)ERROR_PTR("operation not defined", procName, NULL); 01085 if (!pdata) 01086 return (TIFF *)ERROR_PTR("&data not defined", procName, NULL); 01087 if (!pdatasize) 01088 return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL); 01089 if (!strcmp(operation, "r") && !strcmp(operation, "w")) 01090 return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName, NULL); 01091 01092 if (!strcmp(operation, "r")) 01093 mstream = memstreamCreateForRead(*pdata, *pdatasize); 01094 else 01095 mstream = memstreamCreateForWrite(pdata, pdatasize); 01096 01097 return TIFFClientOpen(filename, operation, mstream, 01098 tiffReadCallback, tiffWriteCallback, 01099 tiffSeekCallback, tiffCloseCallback, 01100 tiffSizeCallback, tiffMapCallback, 01101 tiffUnmapCallback); 01102 } 01103 01104 01105 01106 PIX * 01107 OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n) 01108 { 01109 l_int32 i, pagefound; 01110 PIX *pix; 01111 TIFF *tif; 01112 //L_MEMSTREAM *memStream; 01113 PROCNAME("pixReadMemTiffCl"); 01114 01115 if (!data) 01116 return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL); 01117 01118 if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) == NULL) 01119 return (PIX *)ERROR_PTR("tif not opened", procName, NULL); 01120 01121 pagefound = FALSE; 01122 pix = NULL; 01123 for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { 01124 if (i == n) { 01125 pagefound = TRUE; 01126 if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { 01127 TIFFCleanup(tif); 01128 return (PIX *)ERROR_PTR("pix not read", procName, NULL); 01129 } 01130 break; 01131 } 01132 if (TIFFReadDirectory(tif) == 0) 01133 break; 01134 } 01135 01136 if (pagefound == FALSE) { 01137 L_WARNING("tiff page %d not found", procName); 01138 TIFFCleanup(tif); 01139 return NULL; 01140 } 01141 01142 TIFFCleanup(tif); 01143 return pix; 01144 } 01145 01146 PIX * 01147 OpenclDevice::pixReadStreamTiffCl(FILE *fp, 01148 l_int32 n) 01149 { 01150 l_int32 i, pagefound; 01151 PIX *pix; 01152 TIFF *tif; 01153 01154 PROCNAME("pixReadStreamTiff"); 01155 01156 if (!fp) 01157 return (PIX *)ERROR_PTR("stream not defined", procName, NULL); 01158 01159 if ((tif = fopenTiffCl(fp, "rb")) == NULL) 01160 return (PIX *)ERROR_PTR("tif not opened", procName, NULL); 01161 01162 pagefound = FALSE; 01163 pix = NULL; 01164 for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { 01165 if (i == n) { 01166 pagefound = TRUE; 01167 if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { 01168 TIFFCleanup(tif); 01169 return (PIX *)ERROR_PTR("pix not read", procName, NULL); 01170 } 01171 break; 01172 } 01173 if (TIFFReadDirectory(tif) == 0) 01174 break; 01175 } 01176 01177 if (pagefound == FALSE) { 01178 L_WARNING("tiff page %d not found", procName, n); 01179 TIFFCleanup(tif); 01180 return NULL; 01181 } 01182 01183 TIFFCleanup(tif); 01184 return pix; 01185 } 01186 01187 static l_int32 01188 getTiffCompressedFormat(l_uint16 tiffcomp) 01189 { 01190 l_int32 comptype; 01191 01192 switch (tiffcomp) 01193 { 01194 case COMPRESSION_CCITTFAX4: 01195 comptype = IFF_TIFF_G4; 01196 break; 01197 case COMPRESSION_CCITTFAX3: 01198 comptype = IFF_TIFF_G3; 01199 break; 01200 case COMPRESSION_CCITTRLE: 01201 comptype = IFF_TIFF_RLE; 01202 break; 01203 case COMPRESSION_PACKBITS: 01204 comptype = IFF_TIFF_PACKBITS; 01205 break; 01206 case COMPRESSION_LZW: 01207 comptype = IFF_TIFF_LZW; 01208 break; 01209 case COMPRESSION_ADOBE_DEFLATE: 01210 comptype = IFF_TIFF_ZIP; 01211 break; 01212 default: 01213 comptype = IFF_TIFF; 01214 break; 01215 } 01216 return comptype; 01217 } 01218 01219 void compare(l_uint32 *cpu, l_uint32 *gpu,int size) 01220 { 01221 for(int i=0;i<size;i++) 01222 { 01223 if(cpu[i]!=gpu[i]) 01224 { 01225 printf("\ndoesnot match\n"); 01226 return; 01227 } 01228 } 01229 printf("\nit matches\n"); 01230 01231 } 01232 01233 //OpenCL implementation of pixReadFromTiffStream. 01234 //Similar to the CPU implentation of pixReadFromTiffStream 01235 PIX * 01236 OpenclDevice::pixReadFromTiffStreamCl(TIFF *tif) 01237 { 01238 l_uint8 *linebuf, *data; 01239 l_uint16 spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation; 01240 l_uint16 *redmap, *greenmap, *bluemap; 01241 l_int32 d, wpl, bpl, comptype, i, ncolors; 01242 l_int32 xres, yres; 01243 l_uint32 w, h; 01244 l_uint32 *line, *tiffdata; 01245 PIX *pix; 01246 PIXCMAP *cmap; 01247 01248 PROCNAME("pixReadFromTiffStream"); 01249 01250 if (!tif) 01251 return (PIX *)ERROR_PTR("tif not defined", procName, NULL); 01252 01253 01254 TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps); 01255 TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp); 01256 bpp = bps * spp; 01257 if (bpp > 32) 01258 return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL); 01259 if (spp == 1) 01260 d = bps; 01261 else if (spp == 3 || spp == 4) 01262 d = 32; 01263 else 01264 return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL); 01265 01266 TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w); 01267 TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h); 01268 tiffbpl = TIFFScanlineSize(tif); 01269 01270 if ((pix = pixCreate(w, h, d)) == NULL) 01271 return (PIX *)ERROR_PTR("pix not made", procName, NULL); 01272 data = (l_uint8 *)pixGetData(pix); 01273 wpl = pixGetWpl(pix); 01274 bpl = 4 * wpl; 01275 01276 01277 if (spp == 1) { 01278 if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL) 01279 return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL); 01280 01281 for (i = 0 ; i < h ; i++) { 01282 if (TIFFReadScanline(tif, linebuf, i, 0) < 0) { 01283 FREE(linebuf); 01284 pixDestroy(&pix); 01285 return (PIX *)ERROR_PTR("line read fail", procName, NULL); 01286 } 01287 memcpy((char *)data, (char *)linebuf, tiffbpl); 01288 data += bpl; 01289 } 01290 if (bps <= 8) 01291 pixEndianByteSwap(pix); 01292 else 01293 pixEndianTwoByteSwap(pix); 01294 FREE(linebuf); 01295 } 01296 else { 01297 if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) { 01298 pixDestroy(&pix); 01299 return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL); 01300 } 01301 if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata, 01302 ORIENTATION_TOPLEFT, 0)) { 01303 FREE(tiffdata); 01304 pixDestroy(&pix); 01305 return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL); 01306 } 01307 line = pixGetData(pix); 01308 01309 //Invoke the OpenCL kernel for pixReadFromTiff 01310 l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); 01311 01312 pixSetData(pix, output_gpu); 01313 // pix already has data allocated, it now points to output_gpu? 01314 FREE(tiffdata); 01315 FREE(line); 01316 //FREE(output_gpu); 01317 } 01318 01319 if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) { 01320 pixSetXRes(pix, xres); 01321 pixSetYRes(pix, yres); 01322 } 01323 01324 01325 TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp); 01326 comptype = getTiffCompressedFormat(tiffcomp); 01327 pixSetInputFormat(pix, comptype); 01328 01329 if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) { 01330 01331 if ((cmap = pixcmapCreate(bps)) == NULL) { 01332 pixDestroy(&pix); 01333 return (PIX *)ERROR_PTR("cmap not made", procName, NULL); 01334 } 01335 ncolors = 1 << bps; 01336 for (i = 0; i < ncolors; i++) 01337 pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8, 01338 bluemap[i] >> 8); 01339 pixSetColormap(pix, cmap); 01340 } 01341 else { 01342 if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) { 01343 01344 if (tiffcomp == COMPRESSION_CCITTFAX3 || 01345 tiffcomp == COMPRESSION_CCITTFAX4 || 01346 tiffcomp == COMPRESSION_CCITTRLE || 01347 tiffcomp == COMPRESSION_CCITTRLEW) { 01348 photometry = PHOTOMETRIC_MINISWHITE; 01349 } 01350 else 01351 photometry = PHOTOMETRIC_MINISBLACK; 01352 } 01353 if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) || 01354 (d == 8 && photometry == PHOTOMETRIC_MINISWHITE)) 01355 pixInvert(pix, pix); 01356 } 01357 01358 if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) { 01359 if (orientation >= 1 && orientation <= 8) { 01360 struct tiff_transform *transform = 01361 &tiff_orientation_transforms[orientation - 1]; 01362 if (transform->vflip) pixFlipTB(pix, pix); 01363 if (transform->hflip) pixFlipLR(pix, pix); 01364 if (transform->rotate) { 01365 PIX *oldpix = pix; 01366 pix = pixRotate90(oldpix, transform->rotate); 01367 pixDestroy(&oldpix); 01368 } 01369 } 01370 } 01371 01372 return pix; 01373 } 01374 01375 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels 01376 cl_int 01377 pixDilateCL_55(l_int32 wpl, l_int32 h) 01378 { 01379 size_t globalThreads[2]; 01380 cl_mem pixtemp; 01381 cl_int status; 01382 int gsize; 01383 size_t localThreads[2]; 01384 01385 //Horizontal pass 01386 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; 01387 globalThreads[0] = gsize; 01388 globalThreads[1] = GROUPSIZE_HMORY; 01389 localThreads[0] = GROUPSIZE_HMORX; 01390 localThreads[1] = GROUPSIZE_HMORY; 01391 01392 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status ); 01393 01394 status = clSetKernelArg(rEnv.mpkKernel, 01395 0, 01396 sizeof(cl_mem), 01397 &pixsCLBuffer); 01398 status = clSetKernelArg(rEnv.mpkKernel, 01399 1, 01400 sizeof(cl_mem), 01401 &pixdCLBuffer); 01402 status = clSetKernelArg(rEnv.mpkKernel, 01403 2, 01404 sizeof(wpl), 01405 (const void *)&wpl); 01406 status = clSetKernelArg(rEnv.mpkKernel, 01407 3, 01408 sizeof(h), 01409 (const void *)&h); 01410 01411 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01412 rEnv.mpkKernel, 01413 2, 01414 NULL, 01415 globalThreads, 01416 localThreads, 01417 0, 01418 NULL, 01419 NULL); 01420 01421 //Swap source and dest buffers 01422 pixtemp = pixsCLBuffer; 01423 pixsCLBuffer = pixdCLBuffer; 01424 pixdCLBuffer = pixtemp; 01425 01426 //Vertical 01427 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01428 globalThreads[0] = gsize; 01429 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01430 globalThreads[1] = gsize; 01431 localThreads[0] = GROUPSIZE_X; 01432 localThreads[1] = GROUPSIZE_Y; 01433 01434 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status ); 01435 01436 status = clSetKernelArg(rEnv.mpkKernel, 01437 0, 01438 sizeof(cl_mem), 01439 &pixsCLBuffer); 01440 status = clSetKernelArg(rEnv.mpkKernel, 01441 1, 01442 sizeof(cl_mem), 01443 &pixdCLBuffer); 01444 status = clSetKernelArg(rEnv.mpkKernel, 01445 2, 01446 sizeof(wpl), 01447 (const void *)&wpl); 01448 status = clSetKernelArg(rEnv.mpkKernel, 01449 3, 01450 sizeof(h), 01451 (const void *)&h); 01452 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01453 rEnv.mpkKernel, 01454 2, 01455 NULL, 01456 globalThreads, 01457 localThreads, 01458 0, 01459 NULL, 01460 NULL); 01461 01462 return status; 01463 } 01464 01465 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels 01466 cl_int 01467 pixErodeCL_55(l_int32 wpl, l_int32 h) 01468 { 01469 size_t globalThreads[2]; 01470 cl_mem pixtemp; 01471 cl_int status; 01472 int gsize; 01473 l_uint32 fwmask, lwmask; 01474 size_t localThreads[2]; 01475 01476 lwmask = lmask32[32 - 2]; 01477 fwmask = rmask32[32 - 2]; 01478 01479 //Horizontal pass 01480 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; 01481 globalThreads[0] = gsize; 01482 globalThreads[1] = GROUPSIZE_HMORY; 01483 localThreads[0] = GROUPSIZE_HMORX; 01484 localThreads[1] = GROUPSIZE_HMORY; 01485 01486 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status ); 01487 01488 status = clSetKernelArg(rEnv.mpkKernel, 01489 0, 01490 sizeof(cl_mem), 01491 &pixsCLBuffer); 01492 status = clSetKernelArg(rEnv.mpkKernel, 01493 1, 01494 sizeof(cl_mem), 01495 &pixdCLBuffer); 01496 status = clSetKernelArg(rEnv.mpkKernel, 01497 2, 01498 sizeof(wpl), 01499 (const void *)&wpl); 01500 status = clSetKernelArg(rEnv.mpkKernel, 01501 3, 01502 sizeof(h), 01503 (const void *)&h); 01504 01505 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01506 rEnv.mpkKernel, 01507 2, 01508 NULL, 01509 globalThreads, 01510 localThreads, 01511 0, 01512 NULL, 01513 NULL); 01514 01515 //Swap source and dest buffers 01516 pixtemp = pixsCLBuffer; 01517 pixsCLBuffer = pixdCLBuffer; 01518 pixdCLBuffer = pixtemp; 01519 01520 //Vertical 01521 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01522 globalThreads[0] = gsize; 01523 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01524 globalThreads[1] = gsize; 01525 localThreads[0] = GROUPSIZE_X; 01526 localThreads[1] = GROUPSIZE_Y; 01527 01528 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status ); 01529 01530 status = clSetKernelArg(rEnv.mpkKernel, 01531 0, 01532 sizeof(cl_mem), 01533 &pixsCLBuffer); 01534 status = clSetKernelArg(rEnv.mpkKernel, 01535 1, 01536 sizeof(cl_mem), 01537 &pixdCLBuffer); 01538 status = clSetKernelArg(rEnv.mpkKernel, 01539 2, 01540 sizeof(wpl), 01541 (const void *)&wpl); 01542 status = clSetKernelArg(rEnv.mpkKernel, 01543 3, 01544 sizeof(h), 01545 (const void *)&h); 01546 status = clSetKernelArg(rEnv.mpkKernel, 01547 4, 01548 sizeof(fwmask), 01549 (const void *)&fwmask); 01550 status = clSetKernelArg(rEnv.mpkKernel, 01551 5, 01552 sizeof(lwmask), 01553 (const void *)&lwmask); 01554 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01555 rEnv.mpkKernel, 01556 2, 01557 NULL, 01558 globalThreads, 01559 localThreads, 01560 0, 01561 NULL, 01562 NULL); 01563 01564 return status; 01565 } 01566 01567 //Morphology Dilate operation. Invokes the relevant OpenCL kernels 01568 cl_int 01569 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 01570 { 01571 l_int32 xp, yp, xn, yn; 01572 SEL* sel; 01573 size_t globalThreads[2]; 01574 cl_mem pixtemp; 01575 cl_int status; 01576 int gsize; 01577 size_t localThreads[2]; 01578 char isEven; 01579 01580 OpenclDevice::SetKernelEnv( &rEnv ); 01581 01582 if (hsize == 5 && vsize == 5) 01583 { 01584 //Specific case for 5x5 01585 status = pixDilateCL_55(wpl, h); 01586 return status; 01587 } 01588 01589 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); 01590 01591 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); 01592 selDestroy(&sel); 01593 //global and local work dimensions for Horizontal pass 01594 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01595 globalThreads[0] = gsize; 01596 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01597 globalThreads[1] = gsize; 01598 localThreads[0] = GROUPSIZE_X; 01599 localThreads[1] = GROUPSIZE_Y; 01600 01601 if (xp > 31 || xn > 31) 01602 { 01603 //Generic case. 01604 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status ); 01605 01606 status = clSetKernelArg(rEnv.mpkKernel, 01607 0, 01608 sizeof(cl_mem), 01609 &pixsCLBuffer); 01610 status = clSetKernelArg(rEnv.mpkKernel, 01611 1, 01612 sizeof(cl_mem), 01613 &pixdCLBuffer); 01614 status = clSetKernelArg(rEnv.mpkKernel, 01615 2, 01616 sizeof(xp), 01617 (const void *)&xp); 01618 status = clSetKernelArg(rEnv.mpkKernel, 01619 3, 01620 sizeof(xn), 01621 (const void *)&xn); 01622 status = clSetKernelArg(rEnv.mpkKernel, 01623 4, 01624 sizeof(wpl), 01625 (const void *)&wpl); 01626 status = clSetKernelArg(rEnv.mpkKernel, 01627 5, 01628 sizeof(h), 01629 (const void *)&h); 01630 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01631 rEnv.mpkKernel, 01632 2, 01633 NULL, 01634 globalThreads, 01635 localThreads, 01636 0, 01637 NULL, 01638 NULL); 01639 01640 if (yp > 0 || yn > 0) 01641 { 01642 pixtemp = pixsCLBuffer; 01643 pixsCLBuffer = pixdCLBuffer; 01644 pixdCLBuffer = pixtemp; 01645 } 01646 } 01647 else if (xp > 0 || xn > 0 ) 01648 { 01649 //Specific Horizontal pass kernel for half width < 32 01650 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status ); 01651 isEven = (xp != xn); 01652 01653 status = clSetKernelArg(rEnv.mpkKernel, 01654 0, 01655 sizeof(cl_mem), 01656 &pixsCLBuffer); 01657 status = clSetKernelArg(rEnv.mpkKernel, 01658 1, 01659 sizeof(cl_mem), 01660 &pixdCLBuffer); 01661 status = clSetKernelArg(rEnv.mpkKernel, 01662 2, 01663 sizeof(xp), 01664 (const void *)&xp); 01665 status = clSetKernelArg(rEnv.mpkKernel, 01666 3, 01667 sizeof(wpl), 01668 (const void *)&wpl); 01669 status = clSetKernelArg(rEnv.mpkKernel, 01670 4, 01671 sizeof(h), 01672 (const void *)&h); 01673 status = clSetKernelArg(rEnv.mpkKernel, 01674 5, 01675 sizeof(isEven), 01676 (const void *)&isEven); 01677 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01678 rEnv.mpkKernel, 01679 2, 01680 NULL, 01681 globalThreads, 01682 localThreads, 01683 0, 01684 NULL, 01685 NULL); 01686 01687 if (yp > 0 || yn > 0) 01688 { 01689 pixtemp = pixsCLBuffer; 01690 pixsCLBuffer = pixdCLBuffer; 01691 pixdCLBuffer = pixtemp; 01692 } 01693 } 01694 01695 if (yp > 0 || yn > 0) 01696 { 01697 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status ); 01698 01699 status = clSetKernelArg(rEnv.mpkKernel, 01700 0, 01701 sizeof(cl_mem), 01702 &pixsCLBuffer); 01703 status = clSetKernelArg(rEnv.mpkKernel, 01704 1, 01705 sizeof(cl_mem), 01706 &pixdCLBuffer); 01707 status = clSetKernelArg(rEnv.mpkKernel, 01708 2, 01709 sizeof(yp), 01710 (const void *)&yp); 01711 status = clSetKernelArg(rEnv.mpkKernel, 01712 3, 01713 sizeof(wpl), 01714 (const void *)&wpl); 01715 status = clSetKernelArg(rEnv.mpkKernel, 01716 4, 01717 sizeof(h), 01718 (const void *)&h); 01719 status = clSetKernelArg(rEnv.mpkKernel, 01720 5, 01721 sizeof(yn), 01722 (const void *)&yn); 01723 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01724 rEnv.mpkKernel, 01725 2, 01726 NULL, 01727 globalThreads, 01728 localThreads, 01729 0, 01730 NULL, 01731 NULL); 01732 } 01733 01734 01735 return status; 01736 } 01737 01738 //Morphology Erode operation. Invokes the relevant OpenCL kernels 01739 cl_int 01740 pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) 01741 { 01742 01743 l_int32 xp, yp, xn, yn; 01744 SEL* sel; 01745 size_t globalThreads[2]; 01746 size_t localThreads[2]; 01747 cl_mem pixtemp; 01748 cl_int status; 01749 int gsize; 01750 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC); 01751 l_uint32 rwmask, lwmask; 01752 char isEven; 01753 01754 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); 01755 01756 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); 01757 selDestroy(&sel); 01758 OpenclDevice::SetKernelEnv( &rEnv ); 01759 01760 if (hsize == 5 && vsize == 5 && isAsymmetric) 01761 { 01762 //Specific kernel for 5x5 01763 status = pixErodeCL_55(wpl, h); 01764 return status; 01765 } 01766 01767 rwmask = rmask32[32 - (xp & 31)]; 01768 lwmask = lmask32[32 - (xn & 31)]; 01769 01770 //global and local work dimensions for Horizontal pass 01771 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01772 globalThreads[0] = gsize; 01773 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01774 globalThreads[1] = gsize; 01775 localThreads[0] = GROUPSIZE_X; 01776 localThreads[1] = GROUPSIZE_Y; 01777 01778 //Horizontal Pass 01779 if (xp > 31 || xn > 31 ) 01780 { 01781 //Generic case. 01782 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status ); 01783 01784 status = clSetKernelArg(rEnv.mpkKernel, 01785 0, 01786 sizeof(cl_mem), 01787 &pixsCLBuffer); 01788 status = clSetKernelArg(rEnv.mpkKernel, 01789 1, 01790 sizeof(cl_mem), 01791 &pixdCLBuffer); 01792 status = clSetKernelArg(rEnv.mpkKernel, 01793 2, 01794 sizeof(xp), 01795 (const void *)&xp); 01796 status = clSetKernelArg(rEnv.mpkKernel, 01797 3, 01798 sizeof(xn), 01799 (const void *)&xn); 01800 status = clSetKernelArg(rEnv.mpkKernel, 01801 4, 01802 sizeof(wpl), 01803 (const void *)&wpl); 01804 status = clSetKernelArg(rEnv.mpkKernel, 01805 5, 01806 sizeof(h), 01807 (const void *)&h); 01808 status = clSetKernelArg(rEnv.mpkKernel, 01809 6, 01810 sizeof(isAsymmetric), 01811 (const void *)&isAsymmetric); 01812 status = clSetKernelArg(rEnv.mpkKernel, 01813 7, 01814 sizeof(rwmask), 01815 (const void *)&rwmask); 01816 status = clSetKernelArg(rEnv.mpkKernel, 01817 8, 01818 sizeof(lwmask), 01819 (const void *)&lwmask); 01820 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01821 rEnv.mpkKernel, 01822 2, 01823 NULL, 01824 globalThreads, 01825 localThreads, 01826 0, 01827 NULL, 01828 NULL); 01829 01830 if (yp > 0 || yn > 0) 01831 { 01832 pixtemp = pixsCLBuffer; 01833 pixsCLBuffer = pixdCLBuffer; 01834 pixdCLBuffer = pixtemp; 01835 } 01836 } 01837 else if (xp > 0 || xn > 0) 01838 { 01839 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status ); 01840 isEven = (xp != xn); 01841 01842 status = clSetKernelArg(rEnv.mpkKernel, 01843 0, 01844 sizeof(cl_mem), 01845 &pixsCLBuffer); 01846 status = clSetKernelArg(rEnv.mpkKernel, 01847 1, 01848 sizeof(cl_mem), 01849 &pixdCLBuffer); 01850 status = clSetKernelArg(rEnv.mpkKernel, 01851 2, 01852 sizeof(xp), 01853 (const void *)&xp); 01854 status = clSetKernelArg(rEnv.mpkKernel, 01855 3, 01856 sizeof(wpl), 01857 (const void *)&wpl); 01858 status = clSetKernelArg(rEnv.mpkKernel, 01859 4, 01860 sizeof(h), 01861 (const void *)&h); 01862 status = clSetKernelArg(rEnv.mpkKernel, 01863 5, 01864 sizeof(isAsymmetric), 01865 (const void *)&isAsymmetric); 01866 status = clSetKernelArg(rEnv.mpkKernel, 01867 6, 01868 sizeof(rwmask), 01869 (const void *)&rwmask); 01870 status = clSetKernelArg(rEnv.mpkKernel, 01871 7, 01872 sizeof(lwmask), 01873 (const void *)&lwmask); 01874 status = clSetKernelArg(rEnv.mpkKernel, 01875 8, 01876 sizeof(isEven), 01877 (const void *)&isEven); 01878 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01879 rEnv.mpkKernel, 01880 2, 01881 NULL, 01882 globalThreads, 01883 localThreads, 01884 0, 01885 NULL, 01886 NULL); 01887 01888 if (yp > 0 || yn > 0) 01889 { 01890 pixtemp = pixsCLBuffer; 01891 pixsCLBuffer = pixdCLBuffer; 01892 pixdCLBuffer = pixtemp; 01893 } 01894 } 01895 01896 //Vertical Pass 01897 if (yp > 0 || yn > 0) 01898 { 01899 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status ); 01900 01901 status = clSetKernelArg(rEnv.mpkKernel, 01902 0, 01903 sizeof(cl_mem), 01904 &pixsCLBuffer); 01905 status = clSetKernelArg(rEnv.mpkKernel, 01906 1, 01907 sizeof(cl_mem), 01908 &pixdCLBuffer); 01909 status = clSetKernelArg(rEnv.mpkKernel, 01910 2, 01911 sizeof(yp), 01912 (const void *)&yp); 01913 status = clSetKernelArg(rEnv.mpkKernel, 01914 3, 01915 sizeof(wpl), 01916 (const void *)&wpl); 01917 status = clSetKernelArg(rEnv.mpkKernel, 01918 4, 01919 sizeof(h), 01920 (const void *)&h); 01921 status = clSetKernelArg(rEnv.mpkKernel, 01922 5, 01923 sizeof(isAsymmetric), 01924 (const void *)&isAsymmetric); 01925 status = clSetKernelArg(rEnv.mpkKernel, 01926 6, 01927 sizeof(yn), 01928 (const void *)&yn); 01929 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01930 rEnv.mpkKernel, 01931 2, 01932 NULL, 01933 globalThreads, 01934 localThreads, 01935 0, 01936 NULL, 01937 NULL); 01938 } 01939 01940 return status; 01941 } 01942 01943 // OpenCL implementation of Morphology Dilate 01944 //Note: Assumes the source and dest opencl buffer are initialized. No check done 01945 PIX* 01946 OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) 01947 { 01948 l_uint32 wpl, h; 01949 01950 wpl = pixGetWpl(pixs); 01951 h = pixGetHeight(pixs); 01952 01953 clStatus = pixDilateCL(hsize, vsize, wpl, h); 01954 01955 if (reqDataCopy) 01956 { 01957 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false); 01958 } 01959 01960 return pixd; 01961 } 01962 01963 // OpenCL implementation of Morphology Erode 01964 //Note: Assumes the source and dest opencl buffer are initialized. No check done 01965 PIX* 01966 OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) 01967 { 01968 l_uint32 wpl, h; 01969 01970 wpl = pixGetWpl(pixs); 01971 h = pixGetHeight(pixs); 01972 01973 clStatus = pixErodeCL(hsize, vsize, wpl, h); 01974 01975 if (reqDataCopy) 01976 { 01977 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 01978 } 01979 01980 return pixd; 01981 } 01982 01983 //Morphology Open operation. Invokes the relevant OpenCL kernels 01984 cl_int 01985 pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 01986 { 01987 cl_int status; 01988 cl_mem pixtemp; 01989 01990 //Erode followed by Dilate 01991 status = pixErodeCL(hsize, vsize, wpl, h); 01992 01993 pixtemp = pixsCLBuffer; 01994 pixsCLBuffer = pixdCLBuffer; 01995 pixdCLBuffer = pixtemp; 01996 01997 status = pixDilateCL(hsize, vsize, wpl, h); 01998 01999 return status; 02000 } 02001 02002 //Morphology Close operation. Invokes the relevant OpenCL kernels 02003 cl_int 02004 pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 02005 { 02006 cl_int status; 02007 cl_mem pixtemp; 02008 02009 //Dilate followed by Erode 02010 status = pixDilateCL(hsize, vsize, wpl, h); 02011 02012 pixtemp = pixsCLBuffer; 02013 pixsCLBuffer = pixdCLBuffer; 02014 pixdCLBuffer = pixtemp; 02015 02016 status = pixErodeCL(hsize, vsize, wpl, h); 02017 02018 return status; 02019 } 02020 02021 // OpenCL implementation of Morphology Close 02022 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02023 PIX* 02024 OpenclDevice::pixCloseBrickCL(PIX *pixd, 02025 PIX *pixs, 02026 l_int32 hsize, 02027 l_int32 vsize, 02028 bool reqDataCopy = false) 02029 { 02030 l_uint32 wpl, h; 02031 02032 wpl = pixGetWpl(pixs); 02033 h = pixGetHeight(pixs); 02034 02035 clStatus = pixCloseCL(hsize, vsize, wpl, h); 02036 02037 if (reqDataCopy) 02038 { 02039 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02040 } 02041 02042 return pixd; 02043 } 02044 02045 // OpenCL implementation of Morphology Open 02046 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02047 PIX* 02048 OpenclDevice::pixOpenBrickCL(PIX *pixd, 02049 PIX *pixs, 02050 l_int32 hsize, 02051 l_int32 vsize, 02052 bool reqDataCopy = false) 02053 { 02054 l_uint32 wpl, h; 02055 02056 wpl = pixGetWpl(pixs); 02057 h = pixGetHeight(pixs); 02058 02059 clStatus = pixOpenCL(hsize, vsize, wpl, h); 02060 02061 if (reqDataCopy) 02062 { 02063 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02064 } 02065 02066 return pixd; 02067 } 02068 02069 //pix OR operation: outbuffer = buffer1 | buffer2 02070 cl_int 02071 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) 02072 { 02073 cl_int status; 02074 size_t globalThreads[2]; 02075 int gsize; 02076 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02077 02078 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02079 globalThreads[0] = gsize; 02080 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02081 globalThreads[1] = gsize; 02082 02083 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status ); 02084 02085 status = clSetKernelArg(rEnv.mpkKernel, 02086 0, 02087 sizeof(cl_mem), 02088 &buffer1); 02089 status = clSetKernelArg(rEnv.mpkKernel, 02090 1, 02091 sizeof(cl_mem), 02092 &buffer2); 02093 status = clSetKernelArg(rEnv.mpkKernel, 02094 2, 02095 sizeof(cl_mem), 02096 &outbuffer); 02097 status = clSetKernelArg(rEnv.mpkKernel, 02098 3, 02099 sizeof(wpl), 02100 (const void *)&wpl); 02101 status = clSetKernelArg(rEnv.mpkKernel, 02102 4, 02103 sizeof(h), 02104 (const void *)&h); 02105 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02106 rEnv.mpkKernel, 02107 2, 02108 NULL, 02109 globalThreads, 02110 localThreads, 02111 0, 02112 NULL, 02113 NULL); 02114 02115 return status; 02116 } 02117 02118 //pix AND operation: outbuffer = buffer1 & buffer2 02119 cl_int 02120 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) 02121 { 02122 cl_int status; 02123 size_t globalThreads[2]; 02124 int gsize; 02125 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02126 02127 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02128 globalThreads[0] = gsize; 02129 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02130 globalThreads[1] = gsize; 02131 02132 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status ); 02133 02134 // Enqueue a kernel run call. 02135 status = clSetKernelArg(rEnv.mpkKernel, 02136 0, 02137 sizeof(cl_mem), 02138 &buffer1); 02139 status = clSetKernelArg(rEnv.mpkKernel, 02140 1, 02141 sizeof(cl_mem), 02142 &buffer2); 02143 status = clSetKernelArg(rEnv.mpkKernel, 02144 2, 02145 sizeof(cl_mem), 02146 &outbuffer); 02147 status = clSetKernelArg(rEnv.mpkKernel, 02148 3, 02149 sizeof(wpl), 02150 (const void *)&wpl); 02151 status = clSetKernelArg(rEnv.mpkKernel, 02152 4, 02153 sizeof(h), 02154 (const void *)&h); 02155 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02156 rEnv.mpkKernel, 02157 2, 02158 NULL, 02159 globalThreads, 02160 localThreads, 02161 0, 02162 NULL, 02163 NULL); 02164 02165 return status; 02166 } 02167 02168 //output = buffer1 & ~(buffer2) 02169 cl_int 02170 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL) 02171 { 02172 cl_int status; 02173 size_t globalThreads[2]; 02174 int gsize; 02175 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02176 02177 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02178 globalThreads[0] = gsize; 02179 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02180 globalThreads[1] = gsize; 02181 02182 if (outBuffer != NULL) 02183 { 02184 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status ); 02185 } 02186 else 02187 { 02188 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status ); 02189 } 02190 02191 // Enqueue a kernel run call. 02192 status = clSetKernelArg(rEnv.mpkKernel, 02193 0, 02194 sizeof(cl_mem), 02195 &buffer1); 02196 status = clSetKernelArg(rEnv.mpkKernel, 02197 1, 02198 sizeof(cl_mem), 02199 &buffer2); 02200 status = clSetKernelArg(rEnv.mpkKernel, 02201 2, 02202 sizeof(wpl), 02203 (const void *)&wpl); 02204 status = clSetKernelArg(rEnv.mpkKernel, 02205 3, 02206 sizeof(h), 02207 (const void *)&h); 02208 if (outBuffer != NULL) 02209 { 02210 status = clSetKernelArg(rEnv.mpkKernel, 02211 4, 02212 sizeof(cl_mem), 02213 (const void *)&outBuffer); 02214 } 02215 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02216 rEnv.mpkKernel, 02217 2, 02218 NULL, 02219 globalThreads, 02220 localThreads, 02221 0, 02222 NULL, 02223 NULL); 02224 02225 return status; 02226 } 02227 02228 // OpenCL implementation of Subtract pix 02229 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02230 PIX* 02231 OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false) 02232 { 02233 l_uint32 wpl, h; 02234 02235 PROCNAME("pixSubtractCL"); 02236 02237 if (!pixs1) 02238 return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd); 02239 if (!pixs2) 02240 return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd); 02241 if (pixGetDepth(pixs1) != pixGetDepth(pixs2)) 02242 return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd); 02243 02244 #if EQUAL_SIZE_WARNING 02245 if (!pixSizesEqual(pixs1, pixs2)) 02246 L_WARNING("pixs1 and pixs2 not equal sizes", procName); 02247 #endif /* EQUAL_SIZE_WARNING */ 02248 02249 wpl = pixGetWpl(pixs1); 02250 h = pixGetHeight(pixs1); 02251 02252 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02253 02254 if (reqDataCopy) 02255 { 02256 //Read back output data from OCL buffer to cpu 02257 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ); 02258 } 02259 02260 return pixd; 02261 } 02262 02263 // OpenCL implementation of Hollow pix 02264 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02265 PIX* 02266 OpenclDevice::pixHollowCL(PIX *pixd, 02267 PIX *pixs, 02268 l_int32 close_hsize, 02269 l_int32 close_vsize, 02270 l_int32 open_hsize, 02271 l_int32 open_vsize, 02272 bool reqDataCopy = false) 02273 { 02274 l_uint32 wpl, h; 02275 cl_mem pixtemp; 02276 02277 wpl = pixGetWpl(pixs); 02278 h = pixGetHeight(pixs); 02279 02280 //First step : Close Morph operation: Dilate followed by Erode 02281 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); 02282 02283 //Store the output of close operation in an intermediate buffer 02284 //this will be later used for pixsubtract 02285 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02286 02287 //Second step: Open Operation - Erode followed by Dilate 02288 pixtemp = pixsCLBuffer; 02289 pixsCLBuffer = pixdCLBuffer; 02290 pixdCLBuffer = pixtemp; 02291 02292 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); 02293 02294 //Third step: Subtract : (Close - Open) 02295 pixtemp = pixsCLBuffer; 02296 pixsCLBuffer = pixdCLBuffer; 02297 pixdCLBuffer = pixdCLIntermediate; 02298 pixdCLIntermediate = pixtemp; 02299 02300 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02301 02302 if (reqDataCopy) 02303 { 02304 //Read back output data from OCL buffer to cpu 02305 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02306 } 02307 return pixd; 02308 } 02309 02310 // OpenCL implementation of Get Lines from pix function 02311 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02312 void 02313 OpenclDevice::pixGetLinesCL(PIX *pixd, 02314 PIX *pixs, 02315 PIX** pix_vline, 02316 PIX** pix_hline, 02317 PIX** pixClosed, 02318 bool getpixClosed, 02319 l_int32 close_hsize, l_int32 close_vsize, 02320 l_int32 open_hsize, l_int32 open_vsize, 02321 l_int32 line_hsize, l_int32 line_vsize) 02322 { 02323 l_uint32 wpl, h; 02324 cl_mem pixtemp; 02325 02326 wpl = pixGetWpl(pixs); 02327 h = pixGetHeight(pixs); 02328 02329 //First step : Close Morph operation: Dilate followed by Erode 02330 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); 02331 02332 //Copy the Close output to CPU buffer 02333 if (getpixClosed) 02334 { 02335 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false); 02336 } 02337 02338 //Store the output of close operation in an intermediate buffer 02339 //this will be later used for pixsubtract 02340 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02341 02342 //Second step: Open Operation - Erode followed by Dilate 02343 pixtemp = pixsCLBuffer; 02344 pixsCLBuffer = pixdCLBuffer; 02345 pixdCLBuffer = pixtemp; 02346 02347 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); 02348 02349 //Third step: Subtract : (Close - Open) 02350 pixtemp = pixsCLBuffer; 02351 pixsCLBuffer = pixdCLBuffer; 02352 pixdCLBuffer = pixdCLIntermediate; 02353 pixdCLIntermediate = pixtemp; 02354 02355 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02356 02357 //Store the output of Hollow operation in an intermediate buffer 02358 //this will be later used 02359 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02360 02361 pixtemp = pixsCLBuffer; 02362 pixsCLBuffer = pixdCLBuffer; 02363 pixdCLBuffer = pixtemp; 02364 02365 //Fourth step: Get vertical line 02366 //pixOpenBrick(NULL, pix_hollow, 1, min_line_length); 02367 clStatus = pixOpenCL(1, line_vsize, wpl, h); 02368 02369 //Copy the vertical line output to CPU buffer 02370 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false); 02371 02372 pixtemp = pixsCLBuffer; 02373 pixsCLBuffer = pixdCLIntermediate; 02374 pixdCLIntermediate = pixtemp; 02375 02376 //Fifth step: Get horizontal line 02377 //pixOpenBrick(NULL, pix_hollow, min_line_length, 1); 02378 clStatus = pixOpenCL(line_hsize, 1, wpl, h); 02379 02380 //Copy the horizontal line output to CPU buffer 02381 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true); 02382 02383 return; 02384 } 02385 02386 02387 /************************************************************************* 02388 * HistogramRect 02389 * Otsu Thresholding Operations 02390 * histogramAllChannels is laid out as all channel 0, then all channel 1... 02391 * only supports 1 or 4 channels (bytes_per_pixel) 02392 ************************************************************************/ 02393 int OpenclDevice::HistogramRectOCL( 02394 const unsigned char* imageData, 02395 int bytes_per_pixel, 02396 int bytes_per_line, 02397 int left, // always 0 02398 int top, // always 0 02399 int width, 02400 int height, 02401 int kHistogramSize, 02402 int* histogramAllChannels) 02403 { 02404 PERF_COUNT_START("HistogramRectOCL") 02405 cl_int clStatus; 02406 int retVal= 0; 02407 KernelEnv histKern; 02408 SetKernelEnv( &histKern ); 02409 KernelEnv histRedKern; 02410 SetKernelEnv( &histRedKern ); 02411 /* map imagedata to device as read only */ 02412 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. 02413 // faster option would be to allocate initial image buffer 02414 // using a garlic bus memory type 02415 cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); 02416 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02417 02418 /* setup work group size parameters */ 02419 int block_size = 256; 02420 cl_uint numCUs; 02421 clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); 02422 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02423 02424 int requestedOccupancy = 10; 02425 int numWorkGroups = numCUs * requestedOccupancy; 02426 int numThreads = block_size*numWorkGroups; 02427 size_t local_work_size[] = {static_cast<size_t>(block_size)}; 02428 size_t global_work_size[] = {static_cast<size_t>(numThreads)}; 02429 size_t red_global_work_size[] = {static_cast<size_t>(block_size*kHistogramSize*bytes_per_pixel)}; 02430 02431 /* map histogramAllChannels as write only */ 02432 int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups; 02433 02434 cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus ); 02435 CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer"); 02436 02437 /* intermediate histogram buffer */ 02438 int histRed = 256; 02439 int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed; 02440 02441 cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus ); 02442 CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer"); 02443 02444 /* atomic sync buffer */ 02445 int *zeroBuffer = new int[1]; 02446 zeroBuffer[0] = 0; 02447 cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus ); 02448 CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer"); 02449 delete[] zeroBuffer; 02450 //Create kernel objects based on bytes_per_pixel 02451 if (bytes_per_pixel == 1) 02452 { 02453 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus ); 02454 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel"); 02455 02456 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus ); 02457 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction"); 02458 } else { 02459 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus ); 02460 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); 02461 02462 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus ); 02463 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction"); 02464 } 02465 02466 void *ptr; 02467 02468 //Initialize tmpHistogramBuffer buffer 02469 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus); 02470 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer"); 02471 02472 memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint)); 02473 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL); 02474 02475 /* set kernel 1 arguments */ 02476 clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); 02477 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02478 cl_uint numPixels = width*height; 02479 clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels ); 02480 CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" ); 02481 clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); 02482 CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); 02483 02484 /* set kernel 2 arguments */ 02485 int n = numThreads/bytes_per_pixel; 02486 clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n ); 02487 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02488 clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); 02489 CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); 02490 clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer ); 02491 CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer"); 02492 02493 /* launch histogram */ 02494 PERF_COUNT_SUB("before") 02495 clStatus = clEnqueueNDRangeKernel( 02496 histKern.mpkCmdQueue, 02497 histKern.mpkKernel, 02498 1, NULL, global_work_size, local_work_size, 02499 0, NULL, NULL ); 02500 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" ); 02501 clFinish( histKern.mpkCmdQueue ); 02502 if(clStatus !=0) 02503 { 02504 retVal = -1; 02505 } 02506 /* launch histogram */ 02507 clStatus = clEnqueueNDRangeKernel( 02508 histRedKern.mpkCmdQueue, 02509 histRedKern.mpkKernel, 02510 1, NULL, red_global_work_size, local_work_size, 02511 0, NULL, NULL ); 02512 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" ); 02513 clFinish( histRedKern.mpkCmdQueue ); 02514 if(clStatus !=0) 02515 { 02516 retVal = -1; 02517 } 02518 PERF_COUNT_SUB("redKernel") 02519 02520 /* map results back from gpu */ 02521 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus); 02522 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); 02523 if(clStatus !=0) 02524 { 02525 retVal = -1; 02526 } 02527 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL); 02528 02529 clReleaseMemObject(histogramBuffer); 02530 clReleaseMemObject(imageBuffer); 02531 PERF_COUNT_SUB("after") 02532 PERF_COUNT_END 02533 return retVal; 02534 02535 } 02536 02537 /************************************************************************* 02538 * Threshold the rectangle, taking everything except the image buffer pointer 02539 * from the class, using thresholds/hi_values to the output IMAGE. 02540 * only supports 1 or 4 channels 02541 ************************************************************************/ 02542 int OpenclDevice::ThresholdRectToPixOCL( 02543 const unsigned char* imageData, 02544 int bytes_per_pixel, 02545 int bytes_per_line, 02546 const int* thresholds, 02547 const int* hi_values, 02548 Pix** pix, 02549 int height, 02550 int width, 02551 int top, 02552 int left) { 02553 PERF_COUNT_START("ThresholdRectToPixOCL") 02554 int retVal =0; 02555 /* create pix result buffer */ 02556 *pix = pixCreate(width, height, 1); 02557 uinT32* pixData = pixGetData(*pix); 02558 int wpl = pixGetWpl(*pix); 02559 int pixSize = wpl*height*sizeof(uinT32); // number of pixels 02560 02561 cl_int clStatus; 02562 KernelEnv rEnv; 02563 SetKernelEnv( &rEnv ); 02564 02565 /* setup work group size parameters */ 02566 int block_size = 256; 02567 cl_uint numCUs = 6; 02568 clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); 02569 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02570 02571 int requestedOccupancy = 10; 02572 int numWorkGroups = numCUs * requestedOccupancy; 02573 int numThreads = block_size*numWorkGroups; 02574 size_t local_work_size[] = {(size_t) block_size}; 02575 size_t global_work_size[] = {(size_t) numThreads}; 02576 02577 /* map imagedata to device as read only */ 02578 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. 02579 // faster option would be to allocate initial image buffer 02580 // using a garlic bus memory type 02581 cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); 02582 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02583 02584 /* map pix as write only */ 02585 pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus ); 02586 CHECK_OPENCL( clStatus, "clCreateBuffer pix"); 02587 02588 /* map thresholds and hi_values */ 02589 cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus ); 02590 CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer"); 02591 cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus ); 02592 CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer"); 02593 02594 /* compile kernel */ 02595 if (bytes_per_pixel == 4) { 02596 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus ); 02597 CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix"); 02598 } else { 02599 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus ); 02600 CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan"); 02601 } 02602 02603 /* set kernel arguments */ 02604 clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); 02605 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02606 cl_uint numPixels = width*height; 02607 clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height ); 02608 CHECK_OPENCL( clStatus, "clSetKernelArg height" ); 02609 clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width ); 02610 CHECK_OPENCL( clStatus, "clSetKernelArg width" ); 02611 clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl ); 02612 CHECK_OPENCL( clStatus, "clSetKernelArg wpl" ); 02613 clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer ); 02614 CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" ); 02615 clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer ); 02616 CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" ); 02617 clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer ); 02618 CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer"); 02619 02620 /* launch kernel & wait */ 02621 PERF_COUNT_SUB("before") 02622 clStatus = clEnqueueNDRangeKernel( 02623 rEnv.mpkCmdQueue, 02624 rEnv.mpkKernel, 02625 1, NULL, global_work_size, local_work_size, 02626 0, NULL, NULL ); 02627 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" ); 02628 clFinish( rEnv.mpkCmdQueue ); 02629 PERF_COUNT_SUB("kernel") 02630 if(clStatus !=0) 02631 { 02632 printf("Setting return value to -1\n"); 02633 retVal = -1; 02634 } 02635 /* map results back from gpu */ 02636 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus); 02637 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); 02638 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL); 02639 02640 clReleaseMemObject(imageBuffer); 02641 clReleaseMemObject(thresholdsBuffer); 02642 clReleaseMemObject(hiValuesBuffer); 02643 02644 PERF_COUNT_SUB("after") 02645 PERF_COUNT_END 02646 return retVal; 02647 } 02648 02649 02650 02651 /****************************************************************************** 02652 * Data Types for Device Selection 02653 *****************************************************************************/ 02654 02655 typedef struct _TessScoreEvaluationInputData { 02656 int height; 02657 int width; 02658 int numChannels; 02659 unsigned char *imageData; 02660 Pix *pix; 02661 } TessScoreEvaluationInputData; 02662 02663 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) { 02664 srand(1); 02665 // 8.5x11 inches @ 300dpi rounded to clean multiples 02666 int height = 3328; // %256 02667 int width = 2560; // %512 02668 int numChannels = 4; 02669 input->height = height; 02670 input->width = width; 02671 input->numChannels = numChannels; 02672 unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width]; 02673 input->imageData = (unsigned char *) &imageData4[0]; 02674 02675 // zero out image 02676 unsigned char pixelWhite[4] = { 0, 0, 0, 255}; 02677 unsigned char pixelBlack[4] = {255, 255, 255, 255}; 02678 for (int p = 0; p < height*width; p++) { 02679 //unsigned char tmp[4] = imageData4[0]; 02680 imageData4[p][0] = pixelWhite[0]; 02681 imageData4[p][1] = pixelWhite[1]; 02682 imageData4[p][2] = pixelWhite[2]; 02683 imageData4[p][3] = pixelWhite[3]; 02684 } 02685 // random lines to be eliminated 02686 int maxLineWidth = 64; // pixels wide 02687 int numLines = 10; 02688 // vertical lines 02689 for (int i = 0; i < numLines; i++) { 02690 int lineWidth = rand()%maxLineWidth; 02691 int vertLinePos = lineWidth + rand()%(width-2*lineWidth); 02692 //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth); 02693 for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { 02694 for (int col = 0; col < height; col++) { 02695 //imageData4[row*width+col] = pixelBlack; 02696 imageData4[row*width+col][0] = pixelBlack[0]; 02697 imageData4[row*width+col][1] = pixelBlack[1]; 02698 imageData4[row*width+col][2] = pixelBlack[2]; 02699 imageData4[row*width+col][3] = pixelBlack[3]; 02700 } 02701 } 02702 } 02703 // horizontal lines 02704 for (int i = 0; i < numLines; i++) { 02705 int lineWidth = rand()%maxLineWidth; 02706 int horLinePos = lineWidth + rand()%(height-2*lineWidth); 02707 //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth); 02708 for (int row = 0; row < width; row++) { 02709 for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { 02710 //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col); 02711 //imageData4[row*width+col] = pixelBlack; 02712 imageData4[row*width+col][0] = pixelBlack[0]; 02713 imageData4[row*width+col][1] = pixelBlack[1]; 02714 imageData4[row*width+col][2] = pixelBlack[2]; 02715 imageData4[row*width+col][3] = pixelBlack[3]; 02716 } 02717 } 02718 } 02719 // spots (noise, squares) 02720 float fractionBlack = 0.1; // how much of the image should be blackened 02721 int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2); 02722 for (int i = 0; i < numSpots; i++) { 02723 02724 int lineWidth = rand()%maxLineWidth; 02725 int col = lineWidth + rand()%(width-2*lineWidth); 02726 int row = lineWidth + rand()%(height-2*lineWidth); 02727 //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col ); 02728 for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) { 02729 for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) { 02730 //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c ); 02731 //imageData4[row*width+col] = pixelBlack; 02732 imageData4[r*width+c][0] = pixelBlack[0]; 02733 imageData4[r*width+c][1] = pixelBlack[1]; 02734 imageData4[r*width+c][2] = pixelBlack[2]; 02735 imageData4[r*width+c][3] = pixelBlack[3]; 02736 } 02737 } 02738 } 02739 02740 input->pix = pixCreate(input->width, input->height, 1); 02741 } 02742 02743 typedef struct _TessDeviceScore { 02744 float time; // small time means faster device 02745 bool clError; // were there any opencl errors 02746 bool valid; // was the correct response generated 02747 } TessDeviceScore; 02748 02749 /****************************************************************************** 02750 * Micro Benchmarks for Device Selection 02751 *****************************************************************************/ 02752 02753 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02754 02755 double time = 0; 02756 #if ON_WINDOWS 02757 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02758 QueryPerformanceFrequency(&freq); 02759 #elif ON_APPLE 02760 mach_timebase_info_data_t info = { 0, 0 }; 02761 mach_timebase_info(&info); 02762 long long start,stop; 02763 #else 02764 timespec time_funct_start, time_funct_end; 02765 #endif 02766 // input data 02767 l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload 02768 02769 // function call 02770 if (type == DS_DEVICE_OPENCL_DEVICE) { 02771 #if ON_WINDOWS 02772 QueryPerformanceCounter(&time_funct_start); 02773 #elif ON_APPLE 02774 start = mach_absolute_time(); 02775 #else 02776 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02777 #endif 02778 02779 OpenclDevice::gpuEnv = *env; 02780 int wpl = pixGetWpl(input.pix); 02781 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL); 02782 #if ON_WINDOWS 02783 QueryPerformanceCounter(&time_funct_end); 02784 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02785 #elif ON_APPLE 02786 stop = mach_absolute_time(); 02787 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 02788 #else 02789 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02790 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02791 #endif 02792 02793 } else { 02794 #if ON_WINDOWS 02795 QueryPerformanceCounter(&time_funct_start); 02796 #elif ON_APPLE 02797 start = mach_absolute_time(); 02798 #else 02799 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02800 #endif 02801 Pix *pix = pixCreate(input.width, input.height, 32); 02802 l_uint32 *pixData = pixGetData(pix); 02803 int wpl = pixGetWpl(pix); 02804 //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); 02805 //pixSetData(pix, output_gpu); 02806 int i, j; 02807 int idx = 0; 02808 for (i = 0; i < input.height ; i++) { 02809 for (j = 0; j < input.width; j++) { 02810 02811 l_uint32 tiffword = tiffdata[i * input.width + j]; 02812 l_int32 rval = ((tiffword) & 0xff); 02813 l_int32 gval = (((tiffword) >> 8) & 0xff); 02814 l_int32 bval = (((tiffword) >> 16) & 0xff); 02815 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8); 02816 pixData[idx] = value; 02817 idx++; 02818 } 02819 } 02820 #if ON_WINDOWS 02821 QueryPerformanceCounter(&time_funct_end); 02822 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02823 #elif ON_APPLE 02824 stop = mach_absolute_time(); 02825 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 02826 #else 02827 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02828 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02829 #endif 02830 pixDestroy(&pix); 02831 } 02832 02833 02834 // cleanup 02835 02836 return time; 02837 } 02838 02839 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02840 02841 double time; 02842 #if ON_WINDOWS 02843 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02844 QueryPerformanceFrequency(&freq); 02845 #elif ON_APPLE 02846 mach_timebase_info_data_t info = { 0, 0 }; 02847 mach_timebase_info(&info); 02848 long long start,stop; 02849 #else 02850 timespec time_funct_start, time_funct_end; 02851 #endif 02852 02853 unsigned char pixelHi = (unsigned char)255; 02854 02855 int left = 0; 02856 int top = 0; 02857 int kHistogramSize = 256; 02858 int bytes_per_line = input.width*input.numChannels; 02859 int *histogramAllChannels = new int[kHistogramSize*input.numChannels]; 02860 int retVal= 0; 02861 // function call 02862 if (type == DS_DEVICE_OPENCL_DEVICE) { 02863 #if ON_WINDOWS 02864 QueryPerformanceCounter(&time_funct_start); 02865 #elif ON_APPLE 02866 start = mach_absolute_time(); 02867 #else 02868 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02869 #endif 02870 02871 OpenclDevice::gpuEnv = *env; 02872 int wpl = pixGetWpl(input.pix); 02873 retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels); 02874 02875 #if ON_WINDOWS 02876 QueryPerformanceCounter(&time_funct_end); 02877 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02878 #elif ON_APPLE 02879 stop = mach_absolute_time(); 02880 if(retVal ==0) 02881 { 02882 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 02883 } 02884 else 02885 { 02886 time= FLT_MAX; 02887 } 02888 #else 02889 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02890 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02891 #endif 02892 } else { 02893 02894 int *histogram = new int[kHistogramSize]; 02895 #if ON_WINDOWS 02896 QueryPerformanceCounter(&time_funct_start); 02897 #elif ON_APPLE 02898 start = mach_absolute_time(); 02899 #else 02900 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02901 #endif 02902 for (int ch = 0; ch < input.numChannels; ++ch) { 02903 tesseract::HistogramRect(input.pix, input.numChannels, 02904 left, top, input.width, input.height, histogram); 02905 } 02906 #if ON_WINDOWS 02907 QueryPerformanceCounter(&time_funct_end); 02908 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02909 #elif ON_APPLE 02910 stop = mach_absolute_time(); 02911 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 02912 #else 02913 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02914 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02915 #endif 02916 delete[] histogram; 02917 } 02918 02919 // cleanup 02920 delete[] histogramAllChannels; 02921 return time; 02922 } 02923 02924 //Reproducing the ThresholdRectToPix native version 02925 void ThresholdRectToPix_Native(const unsigned char* imagedata, 02926 int bytes_per_pixel, 02927 int bytes_per_line, 02928 const int* thresholds, 02929 const int* hi_values, 02930 Pix** pix) { 02931 int top = 0; 02932 int left = 0; 02933 int width = pixGetWidth(*pix); 02934 int height = pixGetHeight(*pix); 02935 02936 *pix = pixCreate(width, height, 1); 02937 uinT32* pixdata = pixGetData(*pix); 02938 int wpl = pixGetWpl(*pix); 02939 const unsigned char* srcdata = imagedata + top * bytes_per_line + 02940 left * bytes_per_pixel; 02941 for (int y = 0; y < height; ++y) { 02942 const uinT8* linedata = srcdata; 02943 uinT32* pixline = pixdata + y * wpl; 02944 for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) { 02945 bool white_result = true; 02946 for (int ch = 0; ch < bytes_per_pixel; ++ch) { 02947 if (hi_values[ch] >= 0 && 02948 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) { 02949 white_result = false; 02950 break; 02951 } 02952 } 02953 if (white_result) 02954 CLEAR_DATA_BIT(pixline, x); 02955 else 02956 SET_DATA_BIT(pixline, x); 02957 } 02958 srcdata += bytes_per_line; 02959 } 02960 } 02961 02962 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02963 02964 double time; 02965 int retVal =0; 02966 #if ON_WINDOWS 02967 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02968 QueryPerformanceFrequency(&freq); 02969 #elif ON_APPLE 02970 mach_timebase_info_data_t info = { 0, 0 }; 02971 mach_timebase_info(&info); 02972 long long start,stop; 02973 #else 02974 timespec time_funct_start, time_funct_end; 02975 #endif 02976 02977 // input data 02978 unsigned char pixelHi = (unsigned char)255; 02979 int* thresholds = new int[4]; 02980 thresholds[0] = pixelHi/2; 02981 thresholds[1] = pixelHi/2; 02982 thresholds[2] = pixelHi/2; 02983 thresholds[3] = pixelHi/2; 02984 int *hi_values = new int[4]; 02985 thresholds[0] = pixelHi; 02986 thresholds[1] = pixelHi; 02987 thresholds[2] = pixelHi; 02988 thresholds[3] = pixelHi; 02989 //Pix* pix = pixCreate(width, height, 1); 02990 int top = 0; 02991 int left = 0; 02992 int bytes_per_line = input.width*input.numChannels; 02993 02994 // function call 02995 if (type == DS_DEVICE_OPENCL_DEVICE) { 02996 #if ON_WINDOWS 02997 QueryPerformanceCounter(&time_funct_start); 02998 #elif ON_APPLE 02999 start = mach_absolute_time(); 03000 #else 03001 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 03002 #endif 03003 03004 OpenclDevice::gpuEnv = *env; 03005 int wpl = pixGetWpl(input.pix); 03006 retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left); 03007 03008 #if ON_WINDOWS 03009 QueryPerformanceCounter(&time_funct_end); 03010 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 03011 #elif ON_APPLE 03012 stop = mach_absolute_time(); 03013 if(retVal ==0) 03014 { 03015 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;; 03016 } 03017 else 03018 { 03019 time= FLT_MAX; 03020 } 03021 03022 #else 03023 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 03024 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 03025 #endif 03026 } else { 03027 03028 03029 tesseract::ImageThresholder thresholder; 03030 thresholder.SetImage( input.pix ); 03031 #if ON_WINDOWS 03032 QueryPerformanceCounter(&time_funct_start); 03033 #elif ON_APPLE 03034 start = mach_absolute_time(); 03035 #else 03036 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 03037 #endif 03038 ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line, 03039 thresholds, hi_values, &input.pix ); 03040 03041 #if ON_WINDOWS 03042 QueryPerformanceCounter(&time_funct_end); 03043 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 03044 #elif ON_APPLE 03045 stop = mach_absolute_time(); 03046 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 03047 #else 03048 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 03049 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 03050 #endif 03051 } 03052 03053 // cleanup 03054 delete[] thresholds; 03055 delete[] hi_values; 03056 return time; 03057 } 03058 03059 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 03060 03061 double time = 0; 03062 #if ON_WINDOWS 03063 LARGE_INTEGER freq, time_funct_start, time_funct_end; 03064 QueryPerformanceFrequency(&freq); 03065 #elif ON_APPLE 03066 mach_timebase_info_data_t info = { 0, 0 }; 03067 mach_timebase_info(&info); 03068 long long start,stop; 03069 #else 03070 timespec time_funct_start, time_funct_end; 03071 #endif 03072 03073 // input data 03074 int resolution = 300; 03075 int wpl = pixGetWpl(input.pix); 03076 int kThinLineFraction = 20; // tess constant 03077 int kMinLineLengthFraction = 4; // tess constant 03078 int max_line_width = resolution / kThinLineFraction; 03079 int min_line_length = resolution / kMinLineLengthFraction; 03080 int closing_brick = max_line_width / 3; 03081 03082 // function call 03083 if (type == DS_DEVICE_OPENCL_DEVICE) { 03084 #if ON_WINDOWS 03085 QueryPerformanceCounter(&time_funct_start); 03086 #elif ON_APPLE 03087 start = mach_absolute_time(); 03088 #else 03089 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 03090 #endif 03091 Pix *src_pix = input.pix; 03092 OpenclDevice::gpuEnv = *env; 03093 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix); 03094 Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL; 03095 OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length); 03096 03097 OpenclDevice::releaseMorphCLBuffers(); 03098 03099 #if ON_WINDOWS 03100 QueryPerformanceCounter(&time_funct_end); 03101 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 03102 #elif ON_APPLE 03103 stop = mach_absolute_time(); 03104 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 03105 #else 03106 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 03107 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 03108 #endif 03109 } else { 03110 #if ON_WINDOWS 03111 QueryPerformanceCounter(&time_funct_start); 03112 #elif ON_APPLE 03113 start = mach_absolute_time(); 03114 #else 03115 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 03116 #endif 03117 03118 // native serial code 03119 Pix *src_pix = input.pix; 03120 Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick); 03121 Pix *pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width); 03122 Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid); 03123 pixDestroy(&pix_solid); 03124 Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length); 03125 Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1); 03126 pixDestroy(&pix_hollow); 03127 03128 #if ON_WINDOWS 03129 QueryPerformanceCounter(&time_funct_end); 03130 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 03131 #elif ON_APPLE 03132 stop = mach_absolute_time(); 03133 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; 03134 #else 03135 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 03136 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 03137 #endif 03138 } 03139 03140 return time; 03141 } 03142 03143 03144 03145 /****************************************************************************** 03146 * Device Selection 03147 *****************************************************************************/ 03148 03149 #include "stdlib.h" 03150 03151 03152 // encode score object as byte string 03153 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) { 03154 *serializedScoreSize = sizeof(TessDeviceScore); 03155 *serializedScore = (void *) new unsigned char[*serializedScoreSize]; 03156 memcpy(*serializedScore, device->score, *serializedScoreSize); 03157 return DS_SUCCESS; 03158 } 03159 03160 // parses byte string and stores in score object 03161 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) { 03162 // check that serializedScoreSize == sizeof(TessDeviceScore); 03163 device->score = new TessDeviceScore; 03164 memcpy(device->score, serializedScore, serializedScoreSize); 03165 return DS_SUCCESS; 03166 } 03167 03168 ds_status releaseScore( void* score ) { 03169 delete[] score; 03170 return DS_SUCCESS; 03171 } 03172 03173 // evaluate devices 03174 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { 03175 03176 // overwrite statuc gpuEnv w/ current device 03177 // so native opencl calls can be used; they use static gpuEnv 03178 printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); 03179 GPUEnv *env = NULL; 03180 if (device->type == DS_DEVICE_OPENCL_DEVICE) { 03181 env = new GPUEnv; 03182 //printf("[DS] populating tmp GPUEnv from device\n"); 03183 populateGPUEnvFromDevice( env, device->oclDeviceID); 03184 env->mnFileCount = 0; //argc; 03185 env->mnKernelCount = 0UL; 03186 //printf("[DS] compiling kernels for tmp GPUEnv\n"); 03187 OpenclDevice::gpuEnv = *env; 03188 OpenclDevice::CompileKernelFile(env, ""); 03189 } 03190 03191 TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData; 03192 03193 // pixReadTiff 03194 double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type ); 03195 03196 // HistogramRect 03197 double histogramRectTime = histogramRectMicroBench( env, *input, device->type ); 03198 03199 // ThresholdRectToPix 03200 double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type ); 03201 03202 // getLineMasks 03203 double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type ); 03204 03205 03206 // weigh times (% of cpu time) 03207 // these weights should be the % execution time that the native cpu code took 03208 float composeRGBPixelWeight = 1.2f; 03209 float histogramRectWeight = 2.4f; 03210 float thresholdRectToPixWeight = 4.5f; 03211 float getLineMasksMorphWeight = 5.0f; 03212 03213 float weightedTime = 03214 composeRGBPixelWeight * composeRGBPixelTime + 03215 histogramRectWeight * histogramRectTime + 03216 thresholdRectToPixWeight * thresholdRectToPixTime + 03217 getLineMasksMorphWeight * getLineMasksMorphTime 03218 ; 03219 device->score = (void *)new TessDeviceScore; 03220 ((TessDeviceScore *)device->score)->time = weightedTime; 03221 03222 printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); 03223 printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight ); 03224 printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight ); 03225 printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight ); 03226 printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight ); 03227 printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time ); 03228 return DS_SUCCESS; 03229 } 03230 03231 // initial call to select device 03232 ds_device OpenclDevice::getDeviceSelection( ) { 03233 if (!deviceIsSelected) { 03234 PERF_COUNT_START("getDeviceSelection") 03235 // check if opencl is available at runtime 03236 if( 1 == LoadOpencl() ) { 03237 // opencl is available 03238 //PERF_COUNT_SUB("LoadOpencl") 03239 // setup devices 03240 ds_status status; 03241 ds_profile *profile; 03242 status = initDSProfile( &profile, "v0.1" ); 03243 PERF_COUNT_SUB("initDSProfile") 03244 // try reading scores from file 03245 char *fileName = "tesseract_opencl_profile_devices.dat"; 03246 status = readProfileFromFile( profile, deserializeScore, fileName); 03247 if (status != DS_SUCCESS) { 03248 // need to run evaluation 03249 printf("[DS] Profile file not available (%s); performing profiling.\n", fileName); 03250 03251 // create input data 03252 TessScoreEvaluationInputData input; 03253 populateTessScoreEvaluationInputData( &input ); 03254 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData") 03255 // perform evaluations 03256 unsigned int numUpdates; 03257 status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates ); 03258 PERF_COUNT_SUB("profileDevices") 03259 // write scores to file 03260 if ( status == DS_SUCCESS ) { 03261 status = writeProfileToFile( profile, serializeScore, fileName); 03262 PERF_COUNT_SUB("writeProfileToFile") 03263 if ( status == DS_SUCCESS ) { 03264 printf("[DS] Scores written to file (%s).\n", fileName); 03265 } else { 03266 printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName); 03267 } 03268 } else { 03269 printf("[DS] Unable to evaluate performance; scores not written to file.\n"); 03270 } 03271 } else { 03272 03273 PERF_COUNT_SUB("readProfileFromFile") 03274 printf("[DS] Profile read from file (%s).\n", fileName); 03275 } 03276 03277 // we now have device scores either from file or evaluation 03278 // select fastest using custom Tesseract selection algorithm 03279 float bestTime = FLT_MAX; // begin search with worst possible time 03280 int bestDeviceIdx = -1; 03281 for (int d = 0; d < profile->numDevices; d++) { 03282 ds_device device = profile->devices[d]; 03283 TessDeviceScore score = *(TessDeviceScore *)device.score; 03284 03285 float time = score.time; 03286 printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time); 03287 if (time < bestTime) { 03288 bestTime = time; 03289 bestDeviceIdx = d; 03290 } 03291 } 03292 printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); 03293 // cleanup 03294 // TODO: call destructor for profile object? 03295 03296 bool overrided = false; 03297 char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE"); 03298 if (overrideDeviceStr != NULL) { 03299 int overrideDeviceIdx = atoi(overrideDeviceStr); 03300 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) { 03301 printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx); 03302 bestDeviceIdx = overrideDeviceIdx - 1; 03303 overrided = true; 03304 } else { 03305 printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices); 03306 } 03307 } 03308 03309 if (overrided) { 03310 printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); 03311 } 03312 selectedDevice = profile->devices[bestDeviceIdx]; 03313 // cleanup 03314 releaseDSProfile(profile, releaseScore); 03315 } else { 03316 // opencl isn't available at runtime, select native cpu device 03317 printf("[DS] OpenCL runtime not available.\n"); 03318 selectedDevice.type = DS_DEVICE_NATIVE_CPU; 03319 selectedDevice.oclDeviceName = "(null)"; 03320 selectedDevice.score = NULL; 03321 selectedDevice.oclDeviceID = NULL; 03322 selectedDevice.oclDriverVersion = NULL; 03323 } 03324 deviceIsSelected = true; 03325 PERF_COUNT_SUB("select from Profile") 03326 PERF_COUNT_END 03327 } 03328 //PERF_COUNT_END 03329 return selectedDevice; 03330 } 03331 03332 03333 bool OpenclDevice::selectedDeviceIsOpenCL() { 03334 ds_device device = getDeviceSelection(); 03335 return (device.type == DS_DEVICE_OPENCL_DEVICE); 03336 } 03337 03338 bool OpenclDevice::selectedDeviceIsNativeCPU() { 03339 ds_device device = getDeviceSelection(); 03340 return (device.type == DS_DEVICE_NATIVE_CPU); 03341 } 03342 03343 03344 03356 #define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val)) 03357 03358 Pix * OpenclDevice::pixConvertRGBToGrayOCL( 03359 Pix *srcPix, // 32-bit source 03360 float rwt, 03361 float gwt, 03362 float bwt ) 03363 { 03364 PERF_COUNT_START("pixConvertRGBToGrayOCL") 03365 Pix *dstPix; // 8-bit destination 03366 03367 if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL; 03368 03369 if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) { 03370 // magic numbers from leptonica 03371 rwt = 0.3; 03372 gwt = 0.5; 03373 bwt = 0.2; 03374 } 03375 // normalize 03376 float sum = rwt + gwt + bwt; 03377 rwt /= sum; 03378 gwt /= sum; 03379 bwt /= sum; 03380 03381 // source pix 03382 int w, h; 03383 pixGetDimensions(srcPix, &w, &h, NULL); 03384 //printf("Image is %i x %i\n", w, h); 03385 unsigned int *srcData = pixGetData(srcPix); 03386 int srcWPL = pixGetWpl(srcPix); 03387 int srcSize = srcWPL * h * sizeof(unsigned int); 03388 03389 // destination pix 03390 if ((dstPix = pixCreate(w, h, 8)) == NULL) 03391 return NULL; 03392 pixCopyResolution(dstPix, srcPix); 03393 unsigned int *dstData = pixGetData(dstPix); 03394 int dstWPL = pixGetWpl(dstPix); 03395 int dstWords = dstWPL * h; 03396 int dstSize = dstWords * sizeof(unsigned int); 03397 //printf("dstSize = %i\n", dstSize); 03398 PERF_COUNT_SUB("pix setup") 03399 03400 // opencl objects 03401 cl_int clStatus; 03402 KernelEnv kEnv; 03403 SetKernelEnv( &kEnv ); 03404 03405 // source buffer 03406 cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus ); 03407 CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer"); 03408 03409 // destination buffer 03410 cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus ); 03411 CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer"); 03412 03413 // setup work group size parameters 03414 int block_size = 256; 03415 int numWorkGroups = ((h*w+block_size-1) / block_size ); 03416 int numThreads = block_size*numWorkGroups; 03417 size_t local_work_size[] = {static_cast<size_t>(block_size)}; 03418 size_t global_work_size[] = {static_cast<size_t>(numThreads)}; 03419 //printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h); 03420 03421 /* compile kernel */ 03422 kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus ); 03423 CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray"); 03424 03425 03426 /* set kernel arguments */ 03427 clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer ); 03428 CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer"); 03429 clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer ); 03430 CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer"); 03431 clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL ); 03432 CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" ); 03433 clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL ); 03434 CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" ); 03435 clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h ); 03436 CHECK_OPENCL( clStatus, "clSetKernelArg height" ); 03437 clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w ); 03438 CHECK_OPENCL( clStatus, "clSetKernelArg width" ); 03439 clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt ); 03440 CHECK_OPENCL( clStatus, "clSetKernelArg rwt" ); 03441 clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt ); 03442 CHECK_OPENCL( clStatus, "clSetKernelArg gwt"); 03443 clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt ); 03444 CHECK_OPENCL( clStatus, "clSetKernelArg bwt"); 03445 03446 /* launch kernel & wait */ 03447 PERF_COUNT_SUB("before") 03448 clStatus = clEnqueueNDRangeKernel( 03449 kEnv.mpkCmdQueue, 03450 kEnv.mpkKernel, 03451 1, NULL, global_work_size, local_work_size, 03452 0, NULL, NULL ); 03453 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" ); 03454 clFinish( kEnv.mpkCmdQueue ); 03455 PERF_COUNT_SUB("kernel") 03456 03457 /* map results back from gpu */ 03458 void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus); 03459 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer"); 03460 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL); 03461 03462 #if 0 03463 // validate: compute on cpu 03464 Pix *cpuPix = pixCreate(w, h, 8); 03465 pixCopyResolution(cpuPix, srcPix); 03466 unsigned int *cpuData = pixGetData(cpuPix); 03467 int cpuWPL = pixGetWpl(cpuPix); 03468 unsigned int *cpuLine, *srcLine; 03469 int i, j; 03470 for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) { 03471 for (j = 0; j < w; j++) { 03472 unsigned int word = *(srcLine + j); 03473 int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) + 03474 gwt * ((word >> L_GREEN_SHIFT) & 0xff) + 03475 bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5); 03476 SET_DATA_BYTE(cpuLine, j, val); 03477 } 03478 srcLine += srcWPL; 03479 cpuLine += cpuWPL; 03480 } 03481 03482 // validate: compare 03483 printf("converted 32-bit -> 8-bit image\n"); 03484 for (int row = 0; row < h; row++) { 03485 for (int col = 0; col < w; col++) { 03486 int idx = row*w + col; 03487 unsigned int srcVal = srcData[idx]; 03488 unsigned char cpuVal = ((unsigned char *)cpuData)[idx]; 03489 unsigned char oclVal = ((unsigned char *)dstData)[idx]; 03490 if (srcVal > 0) { 03491 printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal); 03492 } 03493 } 03494 //printf("\n"); 03495 } 03496 #endif 03497 // release opencl objects 03498 clReleaseMemObject(srcBuffer); 03499 clReleaseMemObject(dstBuffer); 03500 03501 03502 PERF_COUNT_END 03503 // success 03504 return dstPix; 03505 } 03506 #endif