tesseract 3.04.01

opencl/openclwrapper.cpp

Go to the documentation of this file.
00001 #ifdef _WIN32
00002 #include <windows.h>
00003 #include <io.h>
00004 
00005 #else
00006 #include <sys/types.h>
00007 #include <unistd.h>
00008 #endif
00009 #include <float.h>
00010 
00011 #include "openclwrapper.h"
00012 #include "oclkernels.h"
00013 
00014 // for micro-benchmark
00015 #include "otsuthr.h"
00016 #include "thresholder.h"
00017 
00018 #if ON_APPLE
00019 #include <stdio.h>
00020 #include <mach/mach_time.h>
00021 #endif
00022 
00023 /*
00024     Convenience macro to test the version of Leptonica.
00025 */
00026 #if defined(LIBLEPT_MAJOR_VERSION) && defined(LIBLEPT_MINOR_VERSION)
00027 #   define TESSERACT_LIBLEPT_PREREQ(maj, min) \
00028         ((LIBLEPT_MAJOR_VERSION) > (maj) || ((LIBLEPT_MAJOR_VERSION) == (maj) && (LIBLEPT_MINOR_VERSION) >= (min)))
00029 #else
00030 #   define TESSERACT_LIBLEPT_PREREQ(maj, min) 0
00031 #endif
00032 
00033 #if TESSERACT_LIBLEPT_PREREQ(1,73)
00034 #   define CALLOC LEPT_CALLOC
00035 #   define FREE LEPT_FREE
00036 #endif
00037 
00038 #ifdef USE_OPENCL
00039 
00040 #include "opencl_device_selection.h"
00041 GPUEnv OpenclDevice::gpuEnv;
00042 
00043 
00044 bool OpenclDevice::deviceIsSelected = false;
00045 ds_device OpenclDevice::selectedDevice;
00046 
00047 
00048 int OpenclDevice::isInited = 0;
00049 
00050 struct tiff_transform {
00051     int vflip;    /* if non-zero, image needs a vertical fip */
00052     int hflip;    /* if non-zero, image needs a horizontal flip */
00053     int rotate;   /* -1 -> counterclockwise 90-degree rotation,
00054                       0 -> no rotation
00055                       1 -> clockwise 90-degree rotation */
00056 };
00057 
00058 static struct tiff_transform tiff_orientation_transforms[] = {
00059     {0, 0, 0},
00060     {0, 1, 0},
00061     {1, 1, 0},
00062     {1, 0, 0},
00063     {0, 1, -1},
00064     {0, 0, 1},
00065     {0, 1, 1},
00066     {0, 0, -1}
00067 };
00068 
00069 static const l_int32  MAX_PAGES_IN_TIFF_FILE = 3000;
00070 
00071 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
00072 cl_mem pixThBuffer; //output from thresholdtopix calculation
00073 cl_int clStatus;
00074 KernelEnv rEnv;
00075 
00076 // substitute invalid characters in device name with _
00077 void legalizeFileName( char *fileName) {
00078     //printf("fileName: %s\n", fileName);
00079     const char* invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
00080     // for each invalid char
00081     for (int i = 0; i < strlen(invalidChars); i++) {
00082         char invalidStr[4];
00083         invalidStr[0] = invalidChars[i];
00084         invalidStr[1] = NULL;
00085         //printf("eliminating %s\n", invalidStr);
00086         //char *pos = strstr(fileName, invalidStr);
00087         // initial ./ is valid for present directory
00088         //if (*pos == '.') pos++;
00089         //if (*pos == '/') pos++;
00090         for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) {
00091             //printf("\tfound: %s, ", pos);
00092             pos[0] = '_';
00093             //printf("fileName: %s\n", fileName);
00094         }
00095     }
00096 }
00097 
00098 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
00099     //printf("[DS] populateGPUEnvFromDevice\n");
00100     size_t size;
00101     gpuInfo->mnIsUserCreated = 1;
00102     // device
00103     gpuInfo->mpDevID = device;
00104     gpuInfo->mpArryDevsID = new cl_device_id[1];
00105     gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
00106     clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE       , sizeof(cl_device_type), (void *) &gpuInfo->mDevType       , &size);
00107     CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
00108     // platform
00109     clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM   , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID   , &size);
00110     CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
00111     // context
00112     cl_context_properties props[3];
00113     props[0] = CL_CONTEXT_PLATFORM;
00114     props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
00115     props[2] = 0;
00116     gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
00117     CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
00118     // queue
00119     cl_command_queue_properties queueProperties = 0;
00120     gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
00121     CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
00122 
00123 }
00124 
00125 int OpenclDevice::LoadOpencl()
00126 {
00127 #ifdef WIN32
00128     HINSTANCE HOpenclDll = NULL;
00129   void * OpenclDll = NULL;
00130     //fprintf(stderr, " LoadOpenclDllxx... \n");
00131     OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
00132     OpenclDll = LoadLibrary( "openCL.dll" );
00133     if ( !static_cast<HINSTANCE>( OpenclDll ) )
00134     {
00135         fprintf(stderr, "[OD] Load opencl.dll failed!\n");
00136         FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
00137         return 0;
00138 
00139     }
00140     fprintf(stderr, "[OD] Load opencl.dll successful!\n");
00141 #endif
00142     return 1;
00143 }
00144 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
00145 {
00146     envInfo->mpkContext = gpuEnv.mpContext;
00147     envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
00148     envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
00149 
00150     return 1;
00151 }
00152 
00153 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
00154 {
00155     cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
00156                                         nElements * sizeof(l_uint32), hostbuffer, pStatus);
00157 
00158     return membuffer;
00159 }
00160 
00161 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
00162 {
00163     PROCNAME("mapOutputCLBuffer");
00164     if (!pixd)
00165     {
00166         if (memcopy)
00167         {
00168             if ((pixd = pixCreateTemplate(pixs)) == NULL)
00169                 (PIX *)ERROR_PTR("pixd not made", procName, NULL);
00170         }
00171         else
00172         {
00173             if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
00174                 (PIX *)ERROR_PTR("pixd not made", procName, NULL);
00175         }
00176     }
00177     l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
00178                                                     elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
00179 
00180     if (memcopy)
00181     {
00182         memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
00183     }
00184     else
00185     {
00186         pixSetData(pixd, pValues);
00187     }
00188 
00189     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
00190 
00191     if (sync)
00192     {
00193         clFinish( rEnv.mpkCmdQueue );
00194     }
00195 
00196     return pixd;
00197 }
00198 
00199  cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
00200 {
00201     cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
00202         nElements * sizeof(l_int32), NULL, pStatus);
00203 
00204     if (_pValues != NULL)
00205     {
00206         l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
00207             nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
00208 
00209         memcpy(pValues, _pValues, nElements * sizeof(l_int32));
00210 
00211         clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
00212 
00213         if (sync)
00214             clFinish( rEnv.mpkCmdQueue );
00215     }
00216 
00217     return xValues;
00218 }
00219 
00220 
00221 void OpenclDevice::releaseMorphCLBuffers()
00222 {
00223     if (pixdCLIntermediate != NULL)
00224         clReleaseMemObject(pixdCLIntermediate);
00225     if (pixsCLBuffer != NULL)
00226         clReleaseMemObject(pixsCLBuffer);
00227     if (pixdCLBuffer != NULL)
00228         clReleaseMemObject(pixdCLBuffer);
00229     if (pixThBuffer != NULL)
00230         clReleaseMemObject(pixThBuffer);
00231 }
00232 
00233 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
00234 {
00235     SetKernelEnv( &rEnv );
00236 
00237     if (pixThBuffer != NULL)
00238     {
00239         pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00240 
00241         //Get the output from ThresholdToPix operation
00242         clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
00243     }
00244     else
00245     {
00246         //Get data from the source image
00247         l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
00248         memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
00249 
00250         pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
00251     }
00252 
00253     pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00254 
00255     pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00256 
00257     return (int)clStatus;
00258 }
00259 
00260 int OpenclDevice::InitEnv()
00261 {
00262 //PERF_COUNT_START("OD::InitEnv")
00263 //    printf("[OD] OpenclDevice::InitEnv()\n");
00264 #ifdef SAL_WIN32
00265     while( 1 )
00266     {
00267         if( 1 == LoadOpencl() )
00268             break;
00269     }
00270 PERF_COUNT_SUB("LoadOpencl")
00271 #endif
00272     // sets up environment, compiles programs
00273 
00274 
00275     InitOpenclRunEnv_DeviceSelection( 0 );
00276 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
00277 //PERF_COUNT_END
00278     return 1;
00279 }
00280 
00281 int OpenclDevice::ReleaseOpenclRunEnv()
00282 {
00283     ReleaseOpenclEnv( &gpuEnv );
00284 #ifdef SAL_WIN32
00285     FreeOpenclDll();
00286 #endif
00287     return 1;
00288 }
00289 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
00290 {
00291     if ( kCount < 1 )
00292         fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
00293     strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
00294     gpuEnv.mnKernelCount++;
00295     return 0;
00296 }
00297 int OpenclDevice::RegistOpenclKernel()
00298 {
00299     if ( !gpuEnv.mnIsUserCreated )
00300         memset( &gpuEnv, 0, sizeof(gpuEnv) );
00301 
00302     gpuEnv.mnFileCount = 0; //argc;
00303     gpuEnv.mnKernelCount = 0UL;
00304 
00305     AddKernelConfig( 1, (const char*) "oclAverageSub1" );
00306     return 0;
00307 }
00308 
00309 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
00310 //PERF_COUNT_START("InitOpenclRunEnv_DS")
00311     if (!isInited) {
00312         // after programs compiled, selects best device
00313         //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
00314         ds_device bestDevice_DS = getDeviceSelection( );
00315 //PERF_COUNT_SUB("called getDeviceSelection()")
00316         cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
00317         // overwrite global static GPUEnv with new device
00318         if (selectedDeviceIsOpenCL() ) {
00319             //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
00320         populateGPUEnvFromDevice( &gpuEnv, bestDevice );
00321         gpuEnv.mnFileCount = 0; //argc;
00322         gpuEnv.mnKernelCount = 0UL;
00323 //PERF_COUNT_SUB("populate gpuEnv")
00324         CompileKernelFile(&gpuEnv, "");
00325 //PERF_COUNT_SUB("CompileKernelFile")
00326         } else {
00327             //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
00328         }
00329         isInited = 1;
00330     }
00331 //PERF_COUNT_END
00332     return 0;
00333 }
00334 
00335 
00336 OpenclDevice::OpenclDevice()
00337 {
00338     //InitEnv();
00339 }
00340 
00341 OpenclDevice::~OpenclDevice()
00342 {
00343     //ReleaseOpenclRunEnv();
00344 }
00345 
00346 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
00347 {
00348     int i = 0;
00349     int clStatus = 0;
00350 
00351     if ( !isInited )
00352     {
00353         return 1;
00354     }
00355 
00356     for ( i = 0; i < gpuEnv.mnFileCount; i++ )
00357     {
00358         if ( gpuEnv.mpArryPrograms[i] )
00359         {
00360             clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
00361             CHECK_OPENCL( clStatus, "clReleaseProgram" );
00362             gpuEnv.mpArryPrograms[i] = NULL;
00363         }
00364     }
00365     if ( gpuEnv.mpCmdQueue )
00366     {
00367         clReleaseCommandQueue( gpuEnv.mpCmdQueue );
00368         gpuEnv.mpCmdQueue = NULL;
00369     }
00370     if ( gpuEnv.mpContext )
00371     {
00372         clReleaseContext( gpuEnv.mpContext );
00373         gpuEnv.mpContext = NULL;
00374     }
00375     isInited = 0;
00376     gpuInfo->mnIsUserCreated = 0;
00377     free( gpuInfo->mpArryDevsID );
00378     return 1;
00379 }
00380 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
00381 {
00382     unsigned int i = 0;
00383     cl_int clStatus;
00384     int status = 0;
00385     char *str = NULL;
00386     FILE *fd = NULL;
00387     char fileName[256] = { 0 }, cl_name[128] = { 0 };
00388     char deviceName[1024];
00389     clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
00390     CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00391     str = (char*) strstr( clFileName, (char*) ".cl" );
00392     memcpy( cl_name, clFileName, str - clFileName );
00393     cl_name[str - clFileName] = '\0';
00394     sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
00395     legalizeFileName(fileName);
00396     fd = fopen( fileName, "rb" );
00397     status = ( fd != NULL ) ? 1 : 0;
00398     if ( fd != NULL )
00399     {
00400         *fhandle = fd;
00401     }
00402     return status;
00403 
00404 }
00405 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
00406 {
00407     int i;
00408     for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
00409     {
00410         if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
00411         {
00412             if ( gpuEnvCached->mpArryPrograms[i] != NULL )
00413             {
00414                 return 1;
00415             }
00416         }
00417     }
00418 
00419     return 0;
00420 }
00421 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
00422 {
00423     FILE *output = NULL;
00424     output = fopen( fileName, "wb" );
00425     if ( output == NULL )
00426     {
00427         return 0;
00428     }
00429 
00430     fwrite( birary, sizeof(char), numBytes, output );
00431     fclose( output );
00432 
00433     return 1;
00434 
00435 }
00436 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
00437 {
00438     unsigned int i = 0;
00439     cl_int clStatus;
00440     size_t *binarySizes, numDevices=0;
00441     cl_device_id *mpArryDevsID;
00442     char **binaries, *str = NULL;
00443 
00444     clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
00445                    sizeof(numDevices), &numDevices, NULL );
00446     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00447 
00448     mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
00449     if ( mpArryDevsID == NULL )
00450     {
00451         return 0;
00452     }
00453     /* grab the handles to all of the devices in the program. */
00454     clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
00455                    sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
00456     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00457 
00458     /* figure out the sizes of each of the binaries. */
00459     binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
00460 
00461     clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
00462                    sizeof(size_t) * numDevices, binarySizes, NULL );
00463     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00464 
00465     /* copy over all of the generated binaries. */
00466     binaries = (char**) malloc( sizeof(char *) * numDevices );
00467     if ( binaries == NULL )
00468     {
00469         return 0;
00470     }
00471 
00472     for ( i = 0; i < numDevices; i++ )
00473     {
00474         if ( binarySizes[i] != 0 )
00475         {
00476             binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
00477             if ( binaries[i] == NULL )
00478             {
00479                 return 0;
00480             }
00481         }
00482         else
00483         {
00484             binaries[i] = NULL;
00485         }
00486     }
00487 
00488     clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
00489                    sizeof(char *) * numDevices, binaries, NULL );
00490     CHECK_OPENCL(clStatus,"clGetProgramInfo");
00491 
00492     /* dump out each binary into its own separate file. */
00493     for ( i = 0; i < numDevices; i++ )
00494     {
00495         char fileName[256] = { 0 }, cl_name[128] = { 0 };
00496 
00497         if ( binarySizes[i] != 0 )
00498         {
00499             char deviceName[1024];
00500             clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
00501                            sizeof(deviceName), deviceName, NULL);
00502             CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00503 
00504             str = (char*) strstr( clFileName, (char*) ".cl" );
00505             memcpy( cl_name, clFileName, str - clFileName );
00506             cl_name[str - clFileName] = '\0';
00507             sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
00508             legalizeFileName(fileName);
00509             if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
00510             {
00511                 printf("[OD] write binary[%s] failed\n", fileName);
00512                 return 0;
00513             } //else
00514             printf("[OD] write binary[%s] successfully\n", fileName);
00515         }
00516     }
00517 
00518     // Release all resouces and memory
00519     for ( i = 0; i < numDevices; i++ )
00520     {
00521         if ( binaries[i] != NULL )
00522         {
00523             free( binaries[i] );
00524             binaries[i] = NULL;
00525         }
00526     }
00527 
00528     if ( binaries != NULL )
00529     {
00530         free( binaries );
00531         binaries = NULL;
00532     }
00533 
00534     if ( binarySizes != NULL )
00535     {
00536         free( binarySizes );
00537         binarySizes = NULL;
00538     }
00539 
00540     if ( mpArryDevsID != NULL )
00541     {
00542         free( mpArryDevsID );
00543         mpArryDevsID = NULL;
00544     }
00545     return 1;
00546 }
00547 
00548 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
00549 {
00550     l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
00551         nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
00552     clFinish( rEnv.mpkCmdQueue );
00553     if (_pValues != NULL)
00554     {
00555         for ( int i = 0; i < (int)nElements; i++ )
00556             pValues[i] = (l_int32)_pValues[i];
00557     }
00558 
00559     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
00560     //clFinish( rEnv.mpkCmdQueue );
00561     return;
00562 }
00563 
00564 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
00565 {
00566 //PERF_COUNT_START("CompileKernelFile")
00567     cl_int clStatus = 0;
00568     size_t length;
00569     char *buildLog = NULL, *binary;
00570     const char *source;
00571     size_t source_size[1];
00572     int b_error, binary_status, binaryExisted, idx;
00573     size_t numDevices;
00574     cl_device_id *mpArryDevsID;
00575     FILE *fd, *fd1;
00576     const char* filename = "kernel.cl";
00577     //fprintf(stderr, "[OD] CompileKernelFile ... \n");
00578     if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
00579     {
00580         return 1;
00581     }
00582 
00583     idx = gpuInfo->mnFileCount;
00584 
00585     source = kernel_src;
00586 
00587     source_size[0] = strlen( source );
00588     binaryExisted = 0;
00589         binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
00590 //PERF_COUNT_SUB("BinaryGenerated")
00591     if ( binaryExisted == 1 )
00592     {
00593         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
00594                        sizeof(numDevices), &numDevices, NULL );
00595         CHECK_OPENCL( clStatus, "clGetContextInfo" );
00596 
00597         mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
00598         if ( mpArryDevsID == NULL )
00599         {
00600             return 0;
00601         }
00602 //PERF_COUNT_SUB("get numDevices")
00603         b_error = 0;
00604         length = 0;
00605         b_error |= fseek( fd, 0, SEEK_END ) < 0;
00606         b_error |= ( length = ftell(fd) ) <= 0;
00607         b_error |= fseek( fd, 0, SEEK_SET ) < 0;
00608         if ( b_error )
00609         {
00610             return 0;
00611         }
00612 
00613         binary = (char*) malloc( length + 2 );
00614         if ( !binary )
00615         {
00616             return 0;
00617         }
00618 
00619         memset( binary, 0, length + 2 );
00620         b_error |= fread( binary, 1, length, fd ) != length;
00621 
00622 
00623         fclose( fd );
00624 //PERF_COUNT_SUB("read file")
00625         fd = NULL;
00626         // grab the handles to all of the devices in the context.
00627         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
00628                        sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
00629         CHECK_OPENCL( clStatus, "clGetContextInfo" );
00630 //PERF_COUNT_SUB("get devices")
00631         //fprintf(stderr, "[OD] Create kernel from binary\n");
00632         gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
00633                                            mpArryDevsID, &length, (const unsigned char**) &binary,
00634                                            &binary_status, &clStatus );
00635         CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
00636 //PERF_COUNT_SUB("clCreateProgramWithBinary")
00637         free( binary );
00638         free( mpArryDevsID );
00639         mpArryDevsID = NULL;
00640 //PERF_COUNT_SUB("binaryExisted")
00641     }
00642     else
00643     {
00644         // create a CL program using the kernel source
00645         //fprintf(stderr, "[OD] Create kernel from source\n");
00646         gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
00647                                          source_size, &clStatus);
00648         CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
00649 //PERF_COUNT_SUB("!binaryExisted")
00650     }
00651 
00652     if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
00653     {
00654         return 0;
00655     }
00656 
00657     //char options[512];
00658     // create a cl program executable for all the devices specified
00659     //printf("[OD] BuildProgram.\n");
00660 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
00661     if (!gpuInfo->mnIsUserCreated)
00662     {
00663         clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
00664                        buildOption, NULL, NULL);
00665 //PERF_COUNT_SUB("clBuildProgram notUserCreated")
00666     }
00667     else
00668     {
00669         clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
00670                        buildOption, NULL, NULL);
00671 //PERF_COUNT_SUB("clBuildProgram isUserCreated")
00672     }
00673 PERF_COUNT_END
00674     if ( clStatus != CL_SUCCESS )
00675     {
00676         printf ("BuildProgram error!\n");
00677         if ( !gpuInfo->mnIsUserCreated )
00678         {
00679             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
00680                            CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
00681         }
00682         else
00683         {
00684             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
00685                            CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
00686         }
00687         if ( clStatus != CL_SUCCESS )
00688         {
00689             printf("opencl create build log fail\n");
00690             return 0;
00691         }
00692         buildLog = (char*) malloc( length );
00693         if ( buildLog == (char*) NULL )
00694         {
00695             return 0;
00696         }
00697         if ( !gpuInfo->mnIsUserCreated )
00698         {
00699             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
00700                            CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
00701         }
00702         else
00703         {
00704             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
00705                            CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
00706         }
00707         if ( clStatus != CL_SUCCESS )
00708         {
00709             printf("opencl program build info fail\n");
00710             return 0;
00711         }
00712 
00713         fd1 = fopen( "kernel-build.log", "w+" );
00714         if ( fd1 != NULL )
00715         {
00716             fwrite( buildLog, sizeof(char), length, fd1 );
00717             fclose( fd1 );
00718         }
00719 
00720         free( buildLog );
00721 //PERF_COUNT_SUB("build error log")
00722         return 0;
00723     }
00724 
00725     strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
00726 //PERF_COUNT_SUB("strcpy")
00727     if ( binaryExisted == 0 ) {
00728         GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
00729         PERF_COUNT_SUB("GenerateBinFromKernelSource")
00730     }
00731 
00732     gpuInfo->mnFileCount += 1;
00733 //PERF_COUNT_END
00734     return 1;
00735 }
00736 
00737 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
00738 {
00739 PERF_COUNT_START("pixReadFromTiffKernel")
00740     cl_int clStatus;
00741     KernelEnv rEnv;
00742     size_t globalThreads[2];
00743     size_t localThreads[2];
00744     int gsize;
00745     cl_mem valuesCl;
00746     cl_mem outputCl;
00747 
00748     //global and local work dimensions for Horizontal pass
00749     gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
00750     globalThreads[0] = gsize;
00751     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
00752     globalThreads[1] = gsize;
00753     localThreads[0] = GROUPSIZE_X;
00754     localThreads[1] = GROUPSIZE_Y;
00755 
00756     SetKernelEnv( &rEnv );
00757 
00758     l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
00759     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
00760     CHECK_OPENCL( clStatus, "clCreateKernel");
00761 
00762     //Allocate input and output OCL buffers
00763     valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
00764     outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
00765 
00766     //Kernel arguments
00767     clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
00768     CHECK_OPENCL( clStatus, "clSetKernelArg");
00769     clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
00770     CHECK_OPENCL( clStatus, "clSetKernelArg" );
00771     clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
00772     CHECK_OPENCL( clStatus, "clSetKernelArg" );
00773     clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
00774     CHECK_OPENCL( clStatus, "clSetKernelArg" );
00775     clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
00776     CHECK_OPENCL( clStatus, "clSetKernelArg");
00777 
00778     //Kernel enqueue
00779 PERF_COUNT_SUB("before")
00780     clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
00781     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
00782 
00783      /* map results back from gpu */
00784     void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
00785     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
00786     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
00787 
00788     //Sync
00789     clFinish( rEnv.mpkCmdQueue );
00790 PERF_COUNT_SUB("kernel & map")
00791 PERF_COUNT_END
00792     return pResult;
00793 }
00794 
00795 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
00796 {
00797 PERF_COUNT_START("pixReadTiffCL")
00798     FILE  *fp;
00799 PIX   *pix;
00800 
00801     //printf("pixReadTiffCl file");
00802     PROCNAME("pixReadTiff");
00803 
00804     if (!filename)
00805         return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
00806 
00807     if ((fp = fopenReadStream(filename)) == NULL)
00808         return (PIX *)ERROR_PTR("image file not found", procName, NULL);
00809     if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
00810         fclose(fp);
00811         return (PIX *)ERROR_PTR("pix not read", procName, NULL);
00812     }
00813     fclose(fp);
00814 PERF_COUNT_END
00815     return pix;
00816 
00817 }
00818 TIFF *
00819 OpenclDevice::fopenTiffCl(FILE        *fp,
00820           const char  *modestring)
00821 {
00822 l_int32  fd;
00823 
00824     PROCNAME("fopenTiff");
00825 
00826     if (!fp)
00827         return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
00828     if (!modestring)
00829         return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
00830 
00831     if ((fd = fileno(fp)) < 0)
00832         return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
00833     lseek(fd, 0, SEEK_SET);
00834 
00835     return TIFFFdOpen(fd, "TIFFstream", modestring);
00836 }
00837 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF     *tif,
00838                         l_int32  *pxres,
00839                         l_int32  *pyres)
00840 {
00841 l_uint16   resunit;
00842 l_int32    foundxres, foundyres;
00843 l_float32  fxres, fyres;
00844 
00845     PROCNAME("getTiffStreamResolution");
00846 
00847     if (!tif)
00848         return ERROR_INT("tif not opened", procName, 1);
00849     if (!pxres || !pyres)
00850         return ERROR_INT("&xres and &yres not both defined", procName, 1);
00851     *pxres = *pyres = 0;
00852 
00853     TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
00854     foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
00855     foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
00856     if (!foundxres && !foundyres) return 1;
00857     if (!foundxres && foundyres)
00858         fxres = fyres;
00859     else if (foundxres && !foundyres)
00860         fyres = fxres;
00861 
00862     if (resunit == RESUNIT_CENTIMETER) {  /* convert to ppi */
00863         *pxres = (l_int32)(2.54 * fxres + 0.5);
00864         *pyres = (l_int32)(2.54 * fyres + 0.5);
00865     }
00866     else {
00867         *pxres = (l_int32)fxres;
00868         *pyres = (l_int32)fyres;
00869     }
00870 
00871     return 0;
00872 }
00873 
00874 struct L_Memstream
00875 {
00876 l_uint8   *buffer;    /* expands to hold data when written to;         */
00877                         /* fixed size when read from.                    */
00878 size_t     bufsize;   /* current size allocated when written to;       */
00879                         /* fixed size of input data when read from.      */
00880 size_t     offset;    /* byte offset from beginning of buffer.         */
00881 size_t     hw;        /* high-water mark; max bytes in buffer.         */
00882 l_uint8  **poutdata;  /* input param for writing; data goes here.      */
00883 size_t    *poutsize;  /* input param for writing; data size goes here. */
00884 };
00885 typedef struct L_Memstream  L_MEMSTREAM;
00886 
00887 /* These are static functions for memory I/O */
00888 static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize);
00889 static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata,
00890         size_t *poutsize);
00891 static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length);
00892 static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data,
00893         tsize_t length);
00894 static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence);
00895 static l_int32 tiffCloseCallback(thandle_t handle);
00896 static toff_t tiffSizeCallback(thandle_t handle);
00897 static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length);
00898 static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length);
00899 
00900 
00901 static L_MEMSTREAM *
00902 memstreamCreateForRead(l_uint8  *indata,
00903 size_t    insize)
00904 {
00905         L_MEMSTREAM  *mstream;
00906 
00907         mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
00908         mstream->buffer = indata;   /* handle to input data array */
00909         mstream->bufsize = insize;  /* amount of input data */
00910         mstream->hw = insize;       /* high-water mark fixed at input data size */
00911         mstream->offset = 0;        /* offset always starts at 0 */
00912         return mstream;
00913 }
00914 
00915 
00916 static L_MEMSTREAM *
00917 memstreamCreateForWrite(l_uint8  **poutdata,
00918 size_t    *poutsize)
00919 {
00920         L_MEMSTREAM  *mstream;
00921 
00922         mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
00923         mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1);
00924         mstream->bufsize = 8 * 1024;
00925         mstream->poutdata = poutdata;  /* used only at end of write */
00926         mstream->poutsize = poutsize;  /* ditto  */
00927         mstream->hw = mstream->offset = 0;
00928         return mstream;
00929 }
00930 
00931 
00932 static tsize_t
00933 tiffReadCallback(thandle_t  handle,
00934 tdata_t    data,
00935 tsize_t    length)
00936 {
00937         L_MEMSTREAM  *mstream;
00938         size_t        amount;
00939 
00940         mstream = (L_MEMSTREAM *)handle;
00941         amount = L_MIN((size_t)length, mstream->hw - mstream->offset);
00942         memcpy(data, mstream->buffer + mstream->offset, amount);
00943         mstream->offset += amount;
00944         return amount;
00945 }
00946 
00947 
00948 static tsize_t
00949 tiffWriteCallback(thandle_t  handle,
00950 tdata_t    data,
00951 tsize_t    length)
00952 {
00953         L_MEMSTREAM  *mstream;
00954         size_t        newsize;
00955 
00956         /* reallocNew() uses calloc to initialize the array.
00957         * If malloc is used instead, for some of the encoding methods,
00958         * not all the data in 'bufsize' bytes in the buffer will
00959         * have been initialized by the end of the compression. */
00960         mstream = (L_MEMSTREAM *)handle;
00961         if (mstream->offset + length > mstream->bufsize) {
00962                 newsize = 2 * (mstream->offset + length);
00963                 mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer,
00964                         mstream->offset, newsize);
00965                 mstream->bufsize = newsize;
00966         }
00967 
00968         memcpy(mstream->buffer + mstream->offset, data, length);
00969         mstream->offset += length;
00970         mstream->hw = L_MAX(mstream->offset, mstream->hw);
00971         return length;
00972 }
00973 
00974 
00975 static toff_t
00976 tiffSeekCallback(thandle_t  handle,
00977 toff_t     offset,
00978 l_int32    whence)
00979 {
00980         L_MEMSTREAM  *mstream;
00981 
00982         PROCNAME("tiffSeekCallback");
00983         mstream = (L_MEMSTREAM *)handle;
00984         switch (whence) {
00985         case SEEK_SET:
00986                 /*            fprintf(stderr, "seek_set: offset = %d\n", offset); */
00987                 mstream->offset = offset;
00988                 break;
00989         case SEEK_CUR:
00990                 /*            fprintf(stderr, "seek_cur: offset = %d\n", offset); */
00991                 mstream->offset += offset;
00992                 break;
00993         case SEEK_END:
00994                 /*            fprintf(stderr, "seek end: hw = %d, offset = %d\n",
00995                 mstream->hw, offset); */
00996                 mstream->offset = mstream->hw - offset;  /* offset >= 0 */
00997                 break;
00998         default:
00999                 return (toff_t)ERROR_INT("bad whence value", procName,
01000                         mstream->offset);
01001         }
01002 
01003         return mstream->offset;
01004 }
01005 
01006 
01007 static l_int32
01008 tiffCloseCallback(thandle_t  handle)
01009 {
01010         L_MEMSTREAM  *mstream;
01011 
01012         mstream = (L_MEMSTREAM *)handle;
01013         if (mstream->poutdata) {   /* writing: save the output data */
01014                 *mstream->poutdata = mstream->buffer;
01015                 *mstream->poutsize = mstream->hw;
01016         }
01017         FREE(mstream);  /* never free the buffer! */
01018         return 0;
01019 }
01020 
01021 
01022 static toff_t
01023 tiffSizeCallback(thandle_t  handle)
01024 {
01025         L_MEMSTREAM  *mstream;
01026 
01027         mstream = (L_MEMSTREAM *)handle;
01028         return mstream->hw;
01029 }
01030 
01031 
01032 static l_int32
01033 tiffMapCallback(thandle_t  handle,
01034 tdata_t   *data,
01035 toff_t    *length)
01036 {
01037         L_MEMSTREAM  *mstream;
01038 
01039         mstream = (L_MEMSTREAM *)handle;
01040         *data = mstream->buffer;
01041         *length = mstream->hw;
01042         return 0;
01043 }
01044 
01045 
01046 static void
01047 tiffUnmapCallback(thandle_t  handle,
01048 tdata_t    data,
01049 toff_t     length)
01050 {
01051         return;
01052 }
01053 
01054 
01071 static TIFF *
01072 fopenTiffMemstream(const char  *filename,
01073 const char  *operation,
01074 l_uint8    **pdata,
01075 size_t      *pdatasize)
01076 {
01077         L_MEMSTREAM  *mstream;
01078 
01079         PROCNAME("fopenTiffMemstream");
01080 
01081         if (!filename)
01082                 return (TIFF *)ERROR_PTR("filename not defined", procName, NULL);
01083         if (!operation)
01084                 return (TIFF *)ERROR_PTR("operation not defined", procName, NULL);
01085         if (!pdata)
01086                 return (TIFF *)ERROR_PTR("&data not defined", procName, NULL);
01087         if (!pdatasize)
01088                 return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL);
01089         if (!strcmp(operation, "r") && !strcmp(operation, "w"))
01090                 return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName, NULL);
01091 
01092         if (!strcmp(operation, "r"))
01093                 mstream = memstreamCreateForRead(*pdata, *pdatasize);
01094         else
01095                 mstream = memstreamCreateForWrite(pdata, pdatasize);
01096 
01097         return TIFFClientOpen(filename, operation, mstream,
01098                 tiffReadCallback, tiffWriteCallback,
01099                 tiffSeekCallback, tiffCloseCallback,
01100                 tiffSizeCallback, tiffMapCallback,
01101                 tiffUnmapCallback);
01102 }
01103 
01104 
01105 
01106 PIX *
01107 OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32  n)
01108 {
01109         l_int32  i, pagefound;
01110         PIX     *pix;
01111         TIFF    *tif;
01112         //L_MEMSTREAM *memStream;
01113         PROCNAME("pixReadMemTiffCl");
01114 
01115         if (!data)
01116                 return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL);
01117 
01118         if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) == NULL)
01119                 return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
01120 
01121         pagefound = FALSE;
01122         pix = NULL;
01123         for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
01124                 if (i == n) {
01125                         pagefound = TRUE;
01126                         if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
01127                                 TIFFCleanup(tif);
01128                                 return (PIX *)ERROR_PTR("pix not read", procName, NULL);
01129                         }
01130                         break;
01131                 }
01132                 if (TIFFReadDirectory(tif) == 0)
01133                         break;
01134         }
01135 
01136         if (pagefound == FALSE) {
01137                 L_WARNING("tiff page %d not found", procName);
01138                 TIFFCleanup(tif);
01139                 return NULL;
01140         }
01141 
01142         TIFFCleanup(tif);
01143         return pix;
01144 }
01145 
01146 PIX *
01147 OpenclDevice::pixReadStreamTiffCl(FILE    *fp,
01148                   l_int32  n)
01149 {
01150 l_int32  i, pagefound;
01151 PIX     *pix;
01152 TIFF    *tif;
01153 
01154     PROCNAME("pixReadStreamTiff");
01155 
01156     if (!fp)
01157         return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
01158 
01159     if ((tif = fopenTiffCl(fp, "rb")) == NULL)
01160         return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
01161 
01162     pagefound = FALSE;
01163     pix = NULL;
01164     for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
01165         if (i == n) {
01166             pagefound = TRUE;
01167             if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
01168                 TIFFCleanup(tif);
01169                 return (PIX *)ERROR_PTR("pix not read", procName, NULL);
01170             }
01171             break;
01172         }
01173         if (TIFFReadDirectory(tif) == 0)
01174             break;
01175     }
01176 
01177     if (pagefound == FALSE) {
01178         L_WARNING("tiff page %d not found", procName, n);
01179         TIFFCleanup(tif);
01180         return NULL;
01181     }
01182 
01183     TIFFCleanup(tif);
01184     return pix;
01185 }
01186 
01187 static l_int32
01188 getTiffCompressedFormat(l_uint16  tiffcomp)
01189 {
01190 l_int32  comptype;
01191 
01192     switch (tiffcomp)
01193     {
01194     case COMPRESSION_CCITTFAX4:
01195         comptype = IFF_TIFF_G4;
01196         break;
01197     case COMPRESSION_CCITTFAX3:
01198         comptype = IFF_TIFF_G3;
01199         break;
01200     case COMPRESSION_CCITTRLE:
01201         comptype = IFF_TIFF_RLE;
01202         break;
01203     case COMPRESSION_PACKBITS:
01204         comptype = IFF_TIFF_PACKBITS;
01205         break;
01206     case COMPRESSION_LZW:
01207         comptype = IFF_TIFF_LZW;
01208         break;
01209     case COMPRESSION_ADOBE_DEFLATE:
01210         comptype = IFF_TIFF_ZIP;
01211         break;
01212     default:
01213         comptype = IFF_TIFF;
01214         break;
01215     }
01216     return comptype;
01217 }
01218 
01219 void compare(l_uint32  *cpu, l_uint32  *gpu,int size)
01220 {
01221     for(int i=0;i<size;i++)
01222     {
01223         if(cpu[i]!=gpu[i])
01224         {
01225             printf("\ndoesnot match\n");
01226             return;
01227         }
01228     }
01229     printf("\nit matches\n");
01230 
01231 }
01232 
01233 //OpenCL implementation of pixReadFromTiffStream.
01234 //Similar to the CPU implentation of pixReadFromTiffStream
01235 PIX *
01236 OpenclDevice::pixReadFromTiffStreamCl(TIFF  *tif)
01237 {
01238 l_uint8   *linebuf, *data;
01239 l_uint16   spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
01240 l_uint16  *redmap, *greenmap, *bluemap;
01241 l_int32    d, wpl, bpl, comptype, i, ncolors;
01242 l_int32    xres, yres;
01243 l_uint32   w, h;
01244 l_uint32  *line, *tiffdata;
01245 PIX       *pix;
01246 PIXCMAP   *cmap;
01247 
01248     PROCNAME("pixReadFromTiffStream");
01249 
01250     if (!tif)
01251         return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
01252 
01253 
01254     TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
01255     TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
01256     bpp = bps * spp;
01257     if (bpp > 32)
01258         return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
01259     if (spp == 1)
01260         d = bps;
01261     else if (spp == 3 || spp == 4)
01262         d = 32;
01263     else
01264         return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
01265 
01266     TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
01267     TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
01268     tiffbpl = TIFFScanlineSize(tif);
01269 
01270     if ((pix = pixCreate(w, h, d)) == NULL)
01271         return (PIX *)ERROR_PTR("pix not made", procName, NULL);
01272     data = (l_uint8 *)pixGetData(pix);
01273     wpl = pixGetWpl(pix);
01274     bpl = 4 * wpl;
01275 
01276 
01277     if (spp == 1) {
01278         if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
01279             return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
01280 
01281         for (i = 0 ; i < h ; i++) {
01282             if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
01283                 FREE(linebuf);
01284                 pixDestroy(&pix);
01285                 return (PIX *)ERROR_PTR("line read fail", procName, NULL);
01286             }
01287             memcpy((char *)data, (char *)linebuf, tiffbpl);
01288             data += bpl;
01289         }
01290         if (bps <= 8)
01291             pixEndianByteSwap(pix);
01292         else
01293             pixEndianTwoByteSwap(pix);
01294         FREE(linebuf);
01295     }
01296     else {
01297         if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
01298             pixDestroy(&pix);
01299             return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
01300         }
01301         if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
01302                                        ORIENTATION_TOPLEFT, 0)) {
01303             FREE(tiffdata);
01304             pixDestroy(&pix);
01305             return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
01306         }
01307         line = pixGetData(pix);
01308 
01309         //Invoke the OpenCL kernel for pixReadFromTiff
01310         l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
01311 
01312         pixSetData(pix, output_gpu);
01313         // pix already has data allocated, it now points to output_gpu?
01314         FREE(tiffdata);
01315         FREE(line);
01316         //FREE(output_gpu);
01317     }
01318 
01319     if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
01320         pixSetXRes(pix, xres);
01321         pixSetYRes(pix, yres);
01322     }
01323 
01324 
01325     TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
01326     comptype = getTiffCompressedFormat(tiffcomp);
01327     pixSetInputFormat(pix, comptype);
01328 
01329     if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
01330 
01331         if ((cmap = pixcmapCreate(bps)) == NULL) {
01332             pixDestroy(&pix);
01333             return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
01334         }
01335         ncolors = 1 << bps;
01336         for (i = 0; i < ncolors; i++)
01337             pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
01338                             bluemap[i] >> 8);
01339         pixSetColormap(pix, cmap);
01340     }
01341     else {
01342         if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
01343 
01344             if (tiffcomp == COMPRESSION_CCITTFAX3 ||
01345                 tiffcomp == COMPRESSION_CCITTFAX4 ||
01346                 tiffcomp == COMPRESSION_CCITTRLE ||
01347                 tiffcomp == COMPRESSION_CCITTRLEW) {
01348                 photometry = PHOTOMETRIC_MINISWHITE;
01349             }
01350             else
01351                 photometry = PHOTOMETRIC_MINISBLACK;
01352         }
01353         if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
01354             (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
01355             pixInvert(pix, pix);
01356     }
01357 
01358     if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
01359         if (orientation >= 1 && orientation <= 8) {
01360             struct tiff_transform *transform =
01361               &tiff_orientation_transforms[orientation - 1];
01362             if (transform->vflip) pixFlipTB(pix, pix);
01363             if (transform->hflip) pixFlipLR(pix, pix);
01364             if (transform->rotate) {
01365                 PIX *oldpix = pix;
01366                 pix = pixRotate90(oldpix, transform->rotate);
01367                 pixDestroy(&oldpix);
01368            }
01369         }
01370     }
01371 
01372     return pix;
01373 }
01374 
01375 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
01376 cl_int
01377 pixDilateCL_55(l_int32  wpl, l_int32  h)
01378 {
01379     size_t globalThreads[2];
01380     cl_mem pixtemp;
01381     cl_int status;
01382     int gsize;
01383     size_t localThreads[2];
01384 
01385     //Horizontal pass
01386     gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
01387     globalThreads[0] = gsize;
01388     globalThreads[1] = GROUPSIZE_HMORY;
01389     localThreads[0] = GROUPSIZE_HMORX;
01390     localThreads[1] = GROUPSIZE_HMORY;
01391 
01392     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
01393 
01394     status = clSetKernelArg(rEnv.mpkKernel,
01395         0,
01396         sizeof(cl_mem),
01397         &pixsCLBuffer);
01398     status = clSetKernelArg(rEnv.mpkKernel,
01399         1,
01400         sizeof(cl_mem),
01401         &pixdCLBuffer);
01402     status = clSetKernelArg(rEnv.mpkKernel,
01403         2,
01404         sizeof(wpl),
01405         (const void *)&wpl);
01406     status = clSetKernelArg(rEnv.mpkKernel,
01407         3,
01408         sizeof(h),
01409         (const void *)&h);
01410 
01411     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01412                             rEnv.mpkKernel,
01413                             2,
01414                             NULL,
01415                             globalThreads,
01416                             localThreads,
01417                             0,
01418                             NULL,
01419                             NULL);
01420 
01421     //Swap source and dest buffers
01422     pixtemp = pixsCLBuffer;
01423     pixsCLBuffer = pixdCLBuffer;
01424     pixdCLBuffer = pixtemp;
01425 
01426     //Vertical
01427     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01428     globalThreads[0] = gsize;
01429     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01430     globalThreads[1] = gsize;
01431     localThreads[0] = GROUPSIZE_X;
01432     localThreads[1] = GROUPSIZE_Y;
01433 
01434     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
01435 
01436     status = clSetKernelArg(rEnv.mpkKernel,
01437         0,
01438         sizeof(cl_mem),
01439         &pixsCLBuffer);
01440     status = clSetKernelArg(rEnv.mpkKernel,
01441         1,
01442         sizeof(cl_mem),
01443         &pixdCLBuffer);
01444     status = clSetKernelArg(rEnv.mpkKernel,
01445         2,
01446         sizeof(wpl),
01447         (const void *)&wpl);
01448     status = clSetKernelArg(rEnv.mpkKernel,
01449         3,
01450         sizeof(h),
01451         (const void *)&h);
01452     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01453                             rEnv.mpkKernel,
01454                             2,
01455                             NULL,
01456                             globalThreads,
01457                             localThreads,
01458                             0,
01459                             NULL,
01460                             NULL);
01461 
01462     return status;
01463 }
01464 
01465 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
01466 cl_int
01467 pixErodeCL_55(l_int32  wpl, l_int32  h)
01468 {
01469     size_t globalThreads[2];
01470     cl_mem pixtemp;
01471     cl_int status;
01472     int gsize;
01473     l_uint32 fwmask, lwmask;
01474     size_t localThreads[2];
01475 
01476     lwmask = lmask32[32 - 2];
01477     fwmask = rmask32[32 - 2];
01478 
01479     //Horizontal pass
01480     gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
01481     globalThreads[0] = gsize;
01482     globalThreads[1] = GROUPSIZE_HMORY;
01483     localThreads[0] = GROUPSIZE_HMORX;
01484     localThreads[1] = GROUPSIZE_HMORY;
01485 
01486     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
01487 
01488     status = clSetKernelArg(rEnv.mpkKernel,
01489         0,
01490         sizeof(cl_mem),
01491         &pixsCLBuffer);
01492     status = clSetKernelArg(rEnv.mpkKernel,
01493         1,
01494         sizeof(cl_mem),
01495         &pixdCLBuffer);
01496     status = clSetKernelArg(rEnv.mpkKernel,
01497         2,
01498         sizeof(wpl),
01499         (const void *)&wpl);
01500     status = clSetKernelArg(rEnv.mpkKernel,
01501         3,
01502         sizeof(h),
01503         (const void *)&h);
01504 
01505     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01506                             rEnv.mpkKernel,
01507                             2,
01508                             NULL,
01509                             globalThreads,
01510                             localThreads,
01511                             0,
01512                             NULL,
01513                             NULL);
01514 
01515     //Swap source and dest buffers
01516     pixtemp = pixsCLBuffer;
01517     pixsCLBuffer = pixdCLBuffer;
01518     pixdCLBuffer = pixtemp;
01519 
01520     //Vertical
01521     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01522     globalThreads[0] = gsize;
01523     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01524     globalThreads[1] = gsize;
01525     localThreads[0] = GROUPSIZE_X;
01526     localThreads[1] = GROUPSIZE_Y;
01527 
01528     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
01529 
01530     status = clSetKernelArg(rEnv.mpkKernel,
01531         0,
01532         sizeof(cl_mem),
01533         &pixsCLBuffer);
01534     status = clSetKernelArg(rEnv.mpkKernel,
01535         1,
01536         sizeof(cl_mem),
01537         &pixdCLBuffer);
01538     status = clSetKernelArg(rEnv.mpkKernel,
01539         2,
01540         sizeof(wpl),
01541         (const void *)&wpl);
01542     status = clSetKernelArg(rEnv.mpkKernel,
01543         3,
01544         sizeof(h),
01545         (const void *)&h);
01546     status = clSetKernelArg(rEnv.mpkKernel,
01547         4,
01548         sizeof(fwmask),
01549         (const void *)&fwmask);
01550     status = clSetKernelArg(rEnv.mpkKernel,
01551         5,
01552         sizeof(lwmask),
01553         (const void *)&lwmask);
01554     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01555                             rEnv.mpkKernel,
01556                             2,
01557                             NULL,
01558                             globalThreads,
01559                             localThreads,
01560                             0,
01561                             NULL,
01562                             NULL);
01563 
01564     return status;
01565 }
01566 
01567 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
01568 cl_int
01569 pixDilateCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
01570 {
01571     l_int32  xp, yp, xn, yn;
01572     SEL* sel;
01573     size_t globalThreads[2];
01574     cl_mem pixtemp;
01575     cl_int status;
01576     int gsize;
01577     size_t localThreads[2];
01578     char isEven;
01579 
01580     OpenclDevice::SetKernelEnv( &rEnv );
01581 
01582     if (hsize == 5 && vsize == 5)
01583     {
01584         //Specific case for 5x5
01585         status = pixDilateCL_55(wpl, h);
01586         return status;
01587     }
01588 
01589     sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
01590 
01591     selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
01592     selDestroy(&sel);
01593     //global and local work dimensions for Horizontal pass
01594     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01595     globalThreads[0] = gsize;
01596     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01597     globalThreads[1] = gsize;
01598     localThreads[0] = GROUPSIZE_X;
01599     localThreads[1] = GROUPSIZE_Y;
01600 
01601     if (xp > 31 || xn > 31)
01602     {
01603         //Generic case.
01604         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
01605 
01606         status = clSetKernelArg(rEnv.mpkKernel,
01607             0,
01608             sizeof(cl_mem),
01609             &pixsCLBuffer);
01610         status = clSetKernelArg(rEnv.mpkKernel,
01611             1,
01612             sizeof(cl_mem),
01613             &pixdCLBuffer);
01614         status = clSetKernelArg(rEnv.mpkKernel,
01615                 2,
01616                 sizeof(xp),
01617                 (const void *)&xp);
01618         status = clSetKernelArg(rEnv.mpkKernel,
01619                 3,
01620                 sizeof(xn),
01621                 (const void *)&xn);
01622         status = clSetKernelArg(rEnv.mpkKernel,
01623                 4,
01624                 sizeof(wpl),
01625                 (const void *)&wpl);
01626         status = clSetKernelArg(rEnv.mpkKernel,
01627                 5,
01628                 sizeof(h),
01629                 (const void *)&h);
01630         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01631                                 rEnv.mpkKernel,
01632                                 2,
01633                                 NULL,
01634                                 globalThreads,
01635                                 localThreads,
01636                                 0,
01637                                 NULL,
01638                                 NULL);
01639 
01640         if (yp > 0 || yn > 0)
01641         {
01642             pixtemp = pixsCLBuffer;
01643             pixsCLBuffer = pixdCLBuffer;
01644             pixdCLBuffer = pixtemp;
01645         }
01646     }
01647     else if (xp > 0 || xn > 0 )
01648     {
01649         //Specific Horizontal pass kernel for half width < 32
01650         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
01651         isEven = (xp != xn);
01652 
01653         status = clSetKernelArg(rEnv.mpkKernel,
01654             0,
01655             sizeof(cl_mem),
01656             &pixsCLBuffer);
01657         status = clSetKernelArg(rEnv.mpkKernel,
01658             1,
01659             sizeof(cl_mem),
01660             &pixdCLBuffer);
01661         status = clSetKernelArg(rEnv.mpkKernel,
01662                 2,
01663                 sizeof(xp),
01664                 (const void *)&xp);
01665         status = clSetKernelArg(rEnv.mpkKernel,
01666                 3,
01667                 sizeof(wpl),
01668                 (const void *)&wpl);
01669         status = clSetKernelArg(rEnv.mpkKernel,
01670                 4,
01671                 sizeof(h),
01672                 (const void *)&h);
01673         status = clSetKernelArg(rEnv.mpkKernel,
01674                 5,
01675                 sizeof(isEven),
01676                 (const void *)&isEven);
01677         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01678                                 rEnv.mpkKernel,
01679                                 2,
01680                                 NULL,
01681                                 globalThreads,
01682                                 localThreads,
01683                                 0,
01684                                 NULL,
01685                                 NULL);
01686 
01687         if (yp > 0 || yn > 0)
01688         {
01689             pixtemp = pixsCLBuffer;
01690             pixsCLBuffer = pixdCLBuffer;
01691             pixdCLBuffer = pixtemp;
01692         }
01693     }
01694 
01695     if (yp > 0 || yn > 0)
01696     {
01697         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
01698 
01699         status = clSetKernelArg(rEnv.mpkKernel,
01700             0,
01701             sizeof(cl_mem),
01702             &pixsCLBuffer);
01703         status = clSetKernelArg(rEnv.mpkKernel,
01704             1,
01705             sizeof(cl_mem),
01706             &pixdCLBuffer);
01707         status = clSetKernelArg(rEnv.mpkKernel,
01708                 2,
01709                 sizeof(yp),
01710                 (const void *)&yp);
01711         status = clSetKernelArg(rEnv.mpkKernel,
01712                 3,
01713                 sizeof(wpl),
01714                 (const void *)&wpl);
01715         status = clSetKernelArg(rEnv.mpkKernel,
01716                 4,
01717                 sizeof(h),
01718                 (const void *)&h);
01719         status = clSetKernelArg(rEnv.mpkKernel,
01720                 5,
01721                 sizeof(yn),
01722                 (const void *)&yn);
01723         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01724                                 rEnv.mpkKernel,
01725                                 2,
01726                                 NULL,
01727                                 globalThreads,
01728                                 localThreads,
01729                                 0,
01730                                 NULL,
01731                                 NULL);
01732     }
01733 
01734 
01735     return status;
01736 }
01737 
01738 //Morphology Erode operation. Invokes the relevant OpenCL kernels
01739 cl_int
01740 pixErodeCL(l_int32  hsize, l_int32  vsize, l_uint32 wpl, l_uint32 h)
01741 {
01742 
01743     l_int32  xp, yp, xn, yn;
01744     SEL* sel;
01745     size_t globalThreads[2];
01746     size_t localThreads[2];
01747     cl_mem pixtemp;
01748     cl_int status;
01749     int gsize;
01750     char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
01751     l_uint32 rwmask, lwmask;
01752     char isEven;
01753 
01754     sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
01755 
01756     selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
01757     selDestroy(&sel);
01758     OpenclDevice::SetKernelEnv( &rEnv );
01759 
01760     if (hsize == 5 && vsize == 5 && isAsymmetric)
01761     {
01762         //Specific kernel for 5x5
01763         status = pixErodeCL_55(wpl, h);
01764         return status;
01765     }
01766 
01767     rwmask = rmask32[32 - (xp & 31)];
01768     lwmask = lmask32[32 - (xn & 31)];
01769 
01770     //global and local work dimensions for Horizontal pass
01771     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01772     globalThreads[0] = gsize;
01773     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01774     globalThreads[1] = gsize;
01775     localThreads[0] = GROUPSIZE_X;
01776     localThreads[1] = GROUPSIZE_Y;
01777 
01778     //Horizontal Pass
01779     if (xp > 31 || xn > 31 )
01780     {
01781         //Generic case.
01782         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
01783 
01784         status = clSetKernelArg(rEnv.mpkKernel,
01785             0,
01786             sizeof(cl_mem),
01787             &pixsCLBuffer);
01788         status = clSetKernelArg(rEnv.mpkKernel,
01789             1,
01790             sizeof(cl_mem),
01791             &pixdCLBuffer);
01792         status = clSetKernelArg(rEnv.mpkKernel,
01793                 2,
01794                 sizeof(xp),
01795                 (const void *)&xp);
01796         status = clSetKernelArg(rEnv.mpkKernel,
01797                 3,
01798                 sizeof(xn),
01799                 (const void *)&xn);
01800         status = clSetKernelArg(rEnv.mpkKernel,
01801                 4,
01802                 sizeof(wpl),
01803                 (const void *)&wpl);
01804         status = clSetKernelArg(rEnv.mpkKernel,
01805                 5,
01806                 sizeof(h),
01807                 (const void *)&h);
01808         status = clSetKernelArg(rEnv.mpkKernel,
01809                 6,
01810                 sizeof(isAsymmetric),
01811                 (const void *)&isAsymmetric);
01812         status = clSetKernelArg(rEnv.mpkKernel,
01813                 7,
01814                 sizeof(rwmask),
01815                 (const void *)&rwmask);
01816         status = clSetKernelArg(rEnv.mpkKernel,
01817                 8,
01818                 sizeof(lwmask),
01819                 (const void *)&lwmask);
01820         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01821                                 rEnv.mpkKernel,
01822                                 2,
01823                                 NULL,
01824                                 globalThreads,
01825                                 localThreads,
01826                                 0,
01827                                 NULL,
01828                                 NULL);
01829 
01830         if (yp > 0 || yn > 0)
01831         {
01832             pixtemp = pixsCLBuffer;
01833             pixsCLBuffer = pixdCLBuffer;
01834             pixdCLBuffer = pixtemp;
01835         }
01836     }
01837     else if (xp > 0 || xn > 0)
01838     {
01839         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
01840         isEven = (xp != xn);
01841 
01842         status = clSetKernelArg(rEnv.mpkKernel,
01843             0,
01844             sizeof(cl_mem),
01845             &pixsCLBuffer);
01846         status = clSetKernelArg(rEnv.mpkKernel,
01847             1,
01848             sizeof(cl_mem),
01849             &pixdCLBuffer);
01850         status = clSetKernelArg(rEnv.mpkKernel,
01851                 2,
01852                 sizeof(xp),
01853                 (const void *)&xp);
01854         status = clSetKernelArg(rEnv.mpkKernel,
01855                 3,
01856                 sizeof(wpl),
01857                 (const void *)&wpl);
01858         status = clSetKernelArg(rEnv.mpkKernel,
01859                 4,
01860                 sizeof(h),
01861                 (const void *)&h);
01862         status = clSetKernelArg(rEnv.mpkKernel,
01863                 5,
01864                 sizeof(isAsymmetric),
01865                 (const void *)&isAsymmetric);
01866         status = clSetKernelArg(rEnv.mpkKernel,
01867                 6,
01868                 sizeof(rwmask),
01869                 (const void *)&rwmask);
01870         status = clSetKernelArg(rEnv.mpkKernel,
01871                 7,
01872                 sizeof(lwmask),
01873                 (const void *)&lwmask);
01874         status = clSetKernelArg(rEnv.mpkKernel,
01875                 8,
01876                 sizeof(isEven),
01877                 (const void *)&isEven);
01878         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01879                                 rEnv.mpkKernel,
01880                                 2,
01881                                 NULL,
01882                                 globalThreads,
01883                                 localThreads,
01884                                 0,
01885                                 NULL,
01886                                 NULL);
01887 
01888         if (yp > 0 || yn > 0)
01889         {
01890             pixtemp = pixsCLBuffer;
01891             pixsCLBuffer = pixdCLBuffer;
01892             pixdCLBuffer = pixtemp;
01893         }
01894     }
01895 
01896     //Vertical Pass
01897     if (yp > 0 || yn > 0)
01898     {
01899         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
01900 
01901         status = clSetKernelArg(rEnv.mpkKernel,
01902             0,
01903             sizeof(cl_mem),
01904             &pixsCLBuffer);
01905         status = clSetKernelArg(rEnv.mpkKernel,
01906             1,
01907             sizeof(cl_mem),
01908             &pixdCLBuffer);
01909         status = clSetKernelArg(rEnv.mpkKernel,
01910                 2,
01911                 sizeof(yp),
01912                 (const void *)&yp);
01913         status = clSetKernelArg(rEnv.mpkKernel,
01914                 3,
01915                 sizeof(wpl),
01916                 (const void *)&wpl);
01917         status = clSetKernelArg(rEnv.mpkKernel,
01918                 4,
01919                 sizeof(h),
01920                 (const void *)&h);
01921         status = clSetKernelArg(rEnv.mpkKernel,
01922                 5,
01923                 sizeof(isAsymmetric),
01924                 (const void *)&isAsymmetric);
01925         status = clSetKernelArg(rEnv.mpkKernel,
01926                 6,
01927                 sizeof(yn),
01928                 (const void *)&yn);
01929         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01930                                 rEnv.mpkKernel,
01931                                 2,
01932                                 NULL,
01933                                 globalThreads,
01934                                 localThreads,
01935                                 0,
01936                                 NULL,
01937                                 NULL);
01938     }
01939 
01940     return status;
01941 }
01942 
01943 // OpenCL implementation of Morphology Dilate
01944 //Note: Assumes the source and dest opencl buffer are initialized. No check done
01945 PIX*
01946 OpenclDevice::pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
01947 {
01948     l_uint32 wpl, h;
01949 
01950     wpl = pixGetWpl(pixs);
01951     h = pixGetHeight(pixs);
01952 
01953     clStatus = pixDilateCL(hsize, vsize, wpl, h);
01954 
01955     if (reqDataCopy)
01956     {
01957         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
01958     }
01959 
01960     return pixd;
01961 }
01962 
01963 // OpenCL implementation of Morphology Erode
01964 //Note: Assumes the source and dest opencl buffer are initialized. No check done
01965 PIX*
01966 OpenclDevice::pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
01967 {
01968     l_uint32 wpl, h;
01969 
01970     wpl = pixGetWpl(pixs);
01971     h = pixGetHeight(pixs);
01972 
01973     clStatus = pixErodeCL(hsize, vsize, wpl, h);
01974 
01975     if (reqDataCopy)
01976     {
01977         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
01978     }
01979 
01980     return pixd;
01981 }
01982 
01983 //Morphology Open operation. Invokes the relevant OpenCL kernels
01984 cl_int
01985 pixOpenCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
01986 {
01987     cl_int status;
01988     cl_mem pixtemp;
01989 
01990     //Erode followed by Dilate
01991     status = pixErodeCL(hsize, vsize, wpl, h);
01992 
01993     pixtemp = pixsCLBuffer;
01994     pixsCLBuffer = pixdCLBuffer;
01995     pixdCLBuffer = pixtemp;
01996 
01997     status = pixDilateCL(hsize, vsize, wpl, h);
01998 
01999     return status;
02000 }
02001 
02002 //Morphology Close operation. Invokes the relevant OpenCL kernels
02003 cl_int
02004 pixCloseCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
02005 {
02006     cl_int status;
02007     cl_mem pixtemp;
02008 
02009     //Dilate followed by Erode
02010     status = pixDilateCL(hsize, vsize, wpl, h);
02011 
02012     pixtemp = pixsCLBuffer;
02013     pixsCLBuffer = pixdCLBuffer;
02014     pixdCLBuffer = pixtemp;
02015 
02016     status = pixErodeCL(hsize, vsize, wpl, h);
02017 
02018     return status;
02019 }
02020 
02021 // OpenCL implementation of Morphology Close
02022 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02023 PIX*
02024 OpenclDevice::pixCloseBrickCL(PIX  *pixd,
02025                               PIX  *pixs,
02026                               l_int32  hsize,
02027                               l_int32  vsize,
02028                               bool reqDataCopy = false)
02029 {
02030     l_uint32 wpl, h;
02031 
02032     wpl = pixGetWpl(pixs);
02033     h = pixGetHeight(pixs);
02034 
02035     clStatus = pixCloseCL(hsize, vsize, wpl, h);
02036 
02037     if (reqDataCopy)
02038     {
02039         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02040     }
02041 
02042     return pixd;
02043 }
02044 
02045 // OpenCL implementation of Morphology Open
02046 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02047 PIX*
02048 OpenclDevice::pixOpenBrickCL(PIX  *pixd,
02049                               PIX  *pixs,
02050                               l_int32  hsize,
02051                               l_int32  vsize,
02052                               bool reqDataCopy = false)
02053 {
02054     l_uint32 wpl, h;
02055 
02056     wpl = pixGetWpl(pixs);
02057     h = pixGetHeight(pixs);
02058 
02059     clStatus = pixOpenCL(hsize, vsize, wpl, h);
02060 
02061     if (reqDataCopy)
02062     {
02063         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02064     }
02065 
02066     return pixd;
02067 }
02068 
02069 //pix OR operation: outbuffer = buffer1 | buffer2
02070 cl_int
02071 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
02072 {
02073     cl_int status;
02074     size_t globalThreads[2];
02075     int gsize;
02076     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02077 
02078     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02079     globalThreads[0] = gsize;
02080     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02081     globalThreads[1] = gsize;
02082 
02083     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
02084 
02085     status = clSetKernelArg(rEnv.mpkKernel,
02086         0,
02087         sizeof(cl_mem),
02088         &buffer1);
02089     status = clSetKernelArg(rEnv.mpkKernel,
02090         1,
02091         sizeof(cl_mem),
02092         &buffer2);
02093     status = clSetKernelArg(rEnv.mpkKernel,
02094         2,
02095         sizeof(cl_mem),
02096         &outbuffer);
02097     status = clSetKernelArg(rEnv.mpkKernel,
02098             3,
02099             sizeof(wpl),
02100             (const void *)&wpl);
02101     status = clSetKernelArg(rEnv.mpkKernel,
02102             4,
02103             sizeof(h),
02104             (const void *)&h);
02105     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02106                             rEnv.mpkKernel,
02107                             2,
02108                             NULL,
02109                             globalThreads,
02110                             localThreads,
02111                             0,
02112                             NULL,
02113                             NULL);
02114 
02115     return status;
02116 }
02117 
02118 //pix AND operation: outbuffer = buffer1 & buffer2
02119 cl_int
02120 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
02121 {
02122     cl_int status;
02123     size_t globalThreads[2];
02124     int gsize;
02125     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02126 
02127     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02128     globalThreads[0] = gsize;
02129     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02130     globalThreads[1] = gsize;
02131 
02132     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
02133 
02134     // Enqueue a kernel run call.
02135     status = clSetKernelArg(rEnv.mpkKernel,
02136         0,
02137         sizeof(cl_mem),
02138         &buffer1);
02139     status = clSetKernelArg(rEnv.mpkKernel,
02140         1,
02141         sizeof(cl_mem),
02142         &buffer2);
02143     status = clSetKernelArg(rEnv.mpkKernel,
02144         2,
02145         sizeof(cl_mem),
02146         &outbuffer);
02147     status = clSetKernelArg(rEnv.mpkKernel,
02148             3,
02149             sizeof(wpl),
02150             (const void *)&wpl);
02151     status = clSetKernelArg(rEnv.mpkKernel,
02152             4,
02153             sizeof(h),
02154             (const void *)&h);
02155     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02156                             rEnv.mpkKernel,
02157                             2,
02158                             NULL,
02159                             globalThreads,
02160                             localThreads,
02161                             0,
02162                             NULL,
02163                             NULL);
02164 
02165     return status;
02166 }
02167 
02168 //output = buffer1 & ~(buffer2)
02169 cl_int
02170 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
02171 {
02172     cl_int status;
02173     size_t globalThreads[2];
02174     int gsize;
02175     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02176 
02177     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02178     globalThreads[0] = gsize;
02179     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02180     globalThreads[1] = gsize;
02181 
02182     if (outBuffer != NULL)
02183     {
02184         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
02185     }
02186     else
02187     {
02188         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
02189     }
02190 
02191     // Enqueue a kernel run call.
02192     status = clSetKernelArg(rEnv.mpkKernel,
02193         0,
02194         sizeof(cl_mem),
02195         &buffer1);
02196     status = clSetKernelArg(rEnv.mpkKernel,
02197         1,
02198         sizeof(cl_mem),
02199         &buffer2);
02200     status = clSetKernelArg(rEnv.mpkKernel,
02201             2,
02202             sizeof(wpl),
02203             (const void *)&wpl);
02204     status = clSetKernelArg(rEnv.mpkKernel,
02205             3,
02206             sizeof(h),
02207             (const void *)&h);
02208     if (outBuffer != NULL)
02209     {
02210         status = clSetKernelArg(rEnv.mpkKernel,
02211             4,
02212             sizeof(cl_mem),
02213             (const void *)&outBuffer);
02214     }
02215     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02216                             rEnv.mpkKernel,
02217                             2,
02218                             NULL,
02219                             globalThreads,
02220                             localThreads,
02221                             0,
02222                             NULL,
02223                             NULL);
02224 
02225     return status;
02226 }
02227 
02228 // OpenCL implementation of Subtract pix
02229 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02230 PIX*
02231 OpenclDevice::pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy = false)
02232 {
02233     l_uint32 wpl, h;
02234 
02235     PROCNAME("pixSubtractCL");
02236 
02237     if (!pixs1)
02238         return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
02239     if (!pixs2)
02240         return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
02241     if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
02242         return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
02243 
02244 #if  EQUAL_SIZE_WARNING
02245     if (!pixSizesEqual(pixs1, pixs2))
02246         L_WARNING("pixs1 and pixs2 not equal sizes", procName);
02247 #endif  /* EQUAL_SIZE_WARNING */
02248 
02249     wpl = pixGetWpl(pixs1);
02250     h = pixGetHeight(pixs1);
02251 
02252     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02253 
02254     if (reqDataCopy)
02255     {
02256         //Read back output data from OCL buffer to cpu
02257         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
02258     }
02259 
02260     return pixd;
02261 }
02262 
02263 // OpenCL implementation of Hollow pix
02264 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02265 PIX*
02266 OpenclDevice::pixHollowCL(PIX  *pixd,
02267                         PIX  *pixs,
02268                         l_int32  close_hsize,
02269                         l_int32  close_vsize,
02270                         l_int32  open_hsize,
02271                         l_int32  open_vsize,
02272                         bool reqDataCopy = false)
02273 {
02274     l_uint32 wpl, h;
02275     cl_mem pixtemp;
02276 
02277     wpl = pixGetWpl(pixs);
02278     h = pixGetHeight(pixs);
02279 
02280     //First step : Close Morph operation: Dilate followed by Erode
02281     clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
02282 
02283     //Store the output of close operation in an intermediate buffer
02284     //this will be later used for pixsubtract
02285     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02286 
02287     //Second step: Open Operation - Erode followed by Dilate
02288     pixtemp = pixsCLBuffer;
02289     pixsCLBuffer = pixdCLBuffer;
02290     pixdCLBuffer = pixtemp;
02291 
02292     clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
02293 
02294     //Third step: Subtract : (Close - Open)
02295     pixtemp = pixsCLBuffer;
02296     pixsCLBuffer = pixdCLBuffer;
02297     pixdCLBuffer = pixdCLIntermediate;
02298     pixdCLIntermediate = pixtemp;
02299 
02300     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02301 
02302     if (reqDataCopy)
02303     {
02304         //Read back output data from OCL buffer to cpu
02305         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02306     }
02307     return pixd;
02308 }
02309 
02310 // OpenCL implementation of Get Lines from pix function
02311 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02312 void
02313 OpenclDevice::pixGetLinesCL(PIX  *pixd,
02314                             PIX  *pixs,
02315                             PIX** pix_vline,
02316                             PIX** pix_hline,
02317                             PIX** pixClosed,
02318                             bool  getpixClosed,
02319                             l_int32  close_hsize, l_int32  close_vsize,
02320                             l_int32  open_hsize, l_int32  open_vsize,
02321                             l_int32  line_hsize, l_int32  line_vsize)
02322 {
02323     l_uint32 wpl, h;
02324     cl_mem pixtemp;
02325 
02326     wpl = pixGetWpl(pixs);
02327     h = pixGetHeight(pixs);
02328 
02329     //First step : Close Morph operation: Dilate followed by Erode
02330     clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
02331 
02332     //Copy the Close output to CPU buffer
02333     if (getpixClosed)
02334     {
02335         *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
02336     }
02337 
02338     //Store the output of close operation in an intermediate buffer
02339     //this will be later used for pixsubtract
02340     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02341 
02342     //Second step: Open Operation - Erode followed by Dilate
02343     pixtemp = pixsCLBuffer;
02344     pixsCLBuffer = pixdCLBuffer;
02345     pixdCLBuffer = pixtemp;
02346 
02347     clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
02348 
02349     //Third step: Subtract : (Close - Open)
02350     pixtemp = pixsCLBuffer;
02351     pixsCLBuffer = pixdCLBuffer;
02352     pixdCLBuffer = pixdCLIntermediate;
02353     pixdCLIntermediate = pixtemp;
02354 
02355     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02356 
02357     //Store the output of Hollow operation in an intermediate buffer
02358     //this will be later used
02359     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02360 
02361     pixtemp = pixsCLBuffer;
02362     pixsCLBuffer = pixdCLBuffer;
02363     pixdCLBuffer = pixtemp;
02364 
02365     //Fourth step: Get vertical line
02366     //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
02367     clStatus = pixOpenCL(1, line_vsize, wpl, h);
02368 
02369     //Copy the vertical line output to CPU buffer
02370     *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
02371 
02372     pixtemp = pixsCLBuffer;
02373     pixsCLBuffer = pixdCLIntermediate;
02374     pixdCLIntermediate = pixtemp;
02375 
02376     //Fifth step: Get horizontal line
02377     //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
02378     clStatus = pixOpenCL(line_hsize, 1, wpl, h);
02379 
02380     //Copy the horizontal line output to CPU buffer
02381     *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
02382 
02383     return;
02384 }
02385 
02386 
02387 /*************************************************************************
02388  *  HistogramRect
02389  *  Otsu Thresholding Operations
02390  *  histogramAllChannels is laid out as all channel 0, then all channel 1...
02391  *  only supports 1 or 4 channels (bytes_per_pixel)
02392  ************************************************************************/
02393 int OpenclDevice::HistogramRectOCL(
02394     const unsigned char* imageData,
02395     int bytes_per_pixel,
02396     int bytes_per_line,
02397     int left, // always 0
02398     int top, // always 0
02399     int width,
02400     int height,
02401     int kHistogramSize,
02402     int* histogramAllChannels)
02403 {
02404 PERF_COUNT_START("HistogramRectOCL")
02405     cl_int clStatus;
02406     int retVal= 0;
02407     KernelEnv histKern;
02408     SetKernelEnv( &histKern );
02409     KernelEnv histRedKern;
02410     SetKernelEnv( &histRedKern );
02411     /* map imagedata to device as read only */
02412     // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
02413     // faster option would be to allocate initial image buffer
02414     // using a garlic bus memory type
02415     cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
02416     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02417 
02418     /* setup work group size parameters */
02419     int block_size = 256;
02420     cl_uint numCUs;
02421     clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
02422     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02423 
02424     int requestedOccupancy = 10;
02425     int numWorkGroups = numCUs * requestedOccupancy;
02426     int numThreads = block_size*numWorkGroups;
02427     size_t local_work_size[] = {static_cast<size_t>(block_size)};
02428     size_t global_work_size[] = {static_cast<size_t>(numThreads)};
02429     size_t red_global_work_size[] = {static_cast<size_t>(block_size*kHistogramSize*bytes_per_pixel)};
02430 
02431     /* map histogramAllChannels as write only */
02432     int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
02433 
02434     cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
02435     CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
02436 
02437     /* intermediate histogram buffer */
02438     int histRed = 256;
02439     int tmpHistogramBins =  kHistogramSize*bytes_per_pixel*histRed;
02440 
02441     cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
02442     CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
02443 
02444     /* atomic sync buffer */
02445     int *zeroBuffer = new int[1];
02446     zeroBuffer[0] = 0;
02447     cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
02448     CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
02449     delete[] zeroBuffer;
02450     //Create kernel objects based on bytes_per_pixel
02451     if (bytes_per_pixel == 1)
02452     {
02453         histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
02454         CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
02455 
02456         histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
02457         CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
02458     } else {
02459     histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
02460     CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
02461 
02462     histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
02463     CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
02464     }
02465 
02466     void *ptr;
02467 
02468     //Initialize tmpHistogramBuffer buffer
02469     ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
02470     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
02471 
02472     memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
02473     clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
02474 
02475     /* set kernel 1 arguments */
02476     clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
02477     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02478     cl_uint numPixels = width*height;
02479     clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
02480     CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
02481     clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
02482     CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
02483 
02484     /* set kernel 2 arguments */
02485     int n = numThreads/bytes_per_pixel;
02486     clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
02487     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02488     clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
02489     CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
02490     clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
02491     CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
02492 
02493     /* launch histogram */
02494 PERF_COUNT_SUB("before")
02495     clStatus = clEnqueueNDRangeKernel(
02496         histKern.mpkCmdQueue,
02497         histKern.mpkKernel,
02498         1, NULL, global_work_size, local_work_size,
02499         0, NULL, NULL );
02500     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
02501     clFinish( histKern.mpkCmdQueue );
02502     if(clStatus !=0)
02503     {
02504                 retVal = -1;
02505         }
02506     /* launch histogram */
02507     clStatus = clEnqueueNDRangeKernel(
02508         histRedKern.mpkCmdQueue,
02509         histRedKern.mpkKernel,
02510         1, NULL, red_global_work_size, local_work_size,
02511         0, NULL, NULL );
02512     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
02513     clFinish( histRedKern.mpkCmdQueue );
02514         if(clStatus !=0)
02515         {
02516                         retVal = -1;
02517         }
02518 PERF_COUNT_SUB("redKernel")
02519 
02520     /* map results back from gpu */
02521     ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
02522     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
02523     if(clStatus !=0)
02524     {
02525                                 retVal = -1;
02526         }
02527     clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
02528 
02529     clReleaseMemObject(histogramBuffer);
02530     clReleaseMemObject(imageBuffer);
02531 PERF_COUNT_SUB("after")
02532 PERF_COUNT_END
02533    return retVal;
02534 
02535 }
02536 
02537 /*************************************************************************
02538  * Threshold the rectangle, taking everything except the image buffer pointer
02539  * from the class, using thresholds/hi_values to the output IMAGE.
02540  * only supports 1 or 4 channels
02541  ************************************************************************/
02542 int OpenclDevice::ThresholdRectToPixOCL(
02543     const unsigned char* imageData,
02544     int bytes_per_pixel,
02545     int bytes_per_line,
02546     const int* thresholds,
02547     const int* hi_values,
02548     Pix** pix,
02549     int height,
02550     int width,
02551     int top,
02552     int left) {
02553 PERF_COUNT_START("ThresholdRectToPixOCL")
02554     int retVal =0;
02555     /* create pix result buffer */
02556     *pix = pixCreate(width, height, 1);
02557     uinT32* pixData = pixGetData(*pix);
02558     int wpl = pixGetWpl(*pix);
02559     int pixSize = wpl*height*sizeof(uinT32); // number of pixels
02560 
02561     cl_int clStatus;
02562     KernelEnv rEnv;
02563     SetKernelEnv( &rEnv );
02564 
02565     /* setup work group size parameters */
02566     int block_size = 256;
02567     cl_uint numCUs = 6;
02568      clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
02569     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02570 
02571     int requestedOccupancy = 10;
02572     int numWorkGroups = numCUs * requestedOccupancy;
02573     int numThreads = block_size*numWorkGroups;
02574     size_t local_work_size[] = {(size_t) block_size};
02575     size_t global_work_size[] = {(size_t) numThreads};
02576 
02577     /* map imagedata to device as read only */
02578     // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
02579     // faster option would be to allocate initial image buffer
02580     // using a garlic bus memory type
02581     cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
02582     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02583 
02584     /* map pix as write only */
02585     pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
02586     CHECK_OPENCL( clStatus, "clCreateBuffer pix");
02587 
02588     /* map thresholds and hi_values */
02589     cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
02590     CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
02591     cl_mem hiValuesBuffer   = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
02592     CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
02593 
02594     /* compile kernel */
02595     if (bytes_per_pixel == 4) {
02596         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
02597         CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
02598     } else {
02599         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
02600         CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
02601     }
02602 
02603     /* set kernel arguments */
02604     clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
02605     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02606     cl_uint numPixels = width*height;
02607     clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
02608     CHECK_OPENCL( clStatus, "clSetKernelArg height" );
02609     clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
02610     CHECK_OPENCL( clStatus, "clSetKernelArg width" );
02611     clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
02612     CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
02613     clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
02614     CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
02615     clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
02616     CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
02617     clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
02618     CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
02619 
02620     /* launch kernel & wait */
02621 PERF_COUNT_SUB("before")
02622     clStatus = clEnqueueNDRangeKernel(
02623         rEnv.mpkCmdQueue,
02624         rEnv.mpkKernel,
02625         1, NULL, global_work_size, local_work_size,
02626         0, NULL, NULL );
02627     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
02628     clFinish( rEnv.mpkCmdQueue );
02629 PERF_COUNT_SUB("kernel")
02630         if(clStatus !=0)
02631                 {
02632                                 printf("Setting return value to -1\n");
02633                                 retVal = -1;
02634         }
02635     /* map results back from gpu */
02636     void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
02637     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
02638     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
02639 
02640     clReleaseMemObject(imageBuffer);
02641     clReleaseMemObject(thresholdsBuffer);
02642     clReleaseMemObject(hiValuesBuffer);
02643 
02644 PERF_COUNT_SUB("after")
02645 PERF_COUNT_END
02646 return retVal;
02647 }
02648 
02649 
02650 
02651 /******************************************************************************
02652  * Data Types for Device Selection
02653  *****************************************************************************/
02654 
02655 typedef struct _TessScoreEvaluationInputData {
02656     int height;
02657     int width;
02658     int numChannels;
02659     unsigned char *imageData;
02660     Pix *pix;
02661 } TessScoreEvaluationInputData;
02662 
02663 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
02664     srand(1);
02665     // 8.5x11 inches @ 300dpi rounded to clean multiples
02666     int height = 3328; // %256
02667     int width = 2560; // %512
02668     int numChannels = 4;
02669     input->height = height;
02670     input->width = width;
02671     input->numChannels = numChannels;
02672     unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
02673     input->imageData = (unsigned char *) &imageData4[0];
02674 
02675     // zero out image
02676     unsigned char pixelWhite[4] = {  0,   0,   0, 255};
02677     unsigned char pixelBlack[4] = {255, 255, 255, 255};
02678     for (int p = 0; p < height*width; p++) {
02679         //unsigned char tmp[4] = imageData4[0];
02680         imageData4[p][0] = pixelWhite[0];
02681         imageData4[p][1] = pixelWhite[1];
02682         imageData4[p][2] = pixelWhite[2];
02683         imageData4[p][3] = pixelWhite[3];
02684     }
02685     // random lines to be eliminated
02686     int maxLineWidth = 64; // pixels wide
02687     int numLines = 10;
02688     // vertical lines
02689     for (int i = 0; i < numLines; i++) {
02690         int lineWidth = rand()%maxLineWidth;
02691         int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
02692         //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
02693         for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
02694             for (int col = 0; col < height; col++) {
02695                 //imageData4[row*width+col] = pixelBlack;
02696                 imageData4[row*width+col][0] = pixelBlack[0];
02697                 imageData4[row*width+col][1] = pixelBlack[1];
02698                 imageData4[row*width+col][2] = pixelBlack[2];
02699                 imageData4[row*width+col][3] = pixelBlack[3];
02700             }
02701         }
02702     }
02703     // horizontal lines
02704     for (int i = 0; i < numLines; i++) {
02705         int lineWidth = rand()%maxLineWidth;
02706         int horLinePos = lineWidth + rand()%(height-2*lineWidth);
02707         //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
02708         for (int row = 0; row < width; row++) {
02709             for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
02710                 //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
02711                 //imageData4[row*width+col] = pixelBlack;
02712                 imageData4[row*width+col][0] = pixelBlack[0];
02713                 imageData4[row*width+col][1] = pixelBlack[1];
02714                 imageData4[row*width+col][2] = pixelBlack[2];
02715                 imageData4[row*width+col][3] = pixelBlack[3];
02716             }
02717         }
02718     }
02719     // spots (noise, squares)
02720     float fractionBlack = 0.1; // how much of the image should be blackened
02721     int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
02722     for (int i = 0; i < numSpots; i++) {
02723 
02724         int lineWidth = rand()%maxLineWidth;
02725         int col = lineWidth + rand()%(width-2*lineWidth);
02726         int row = lineWidth + rand()%(height-2*lineWidth);
02727         //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
02728         for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
02729             for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
02730                 //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
02731                 //imageData4[row*width+col] = pixelBlack;
02732                 imageData4[r*width+c][0] = pixelBlack[0];
02733                 imageData4[r*width+c][1] = pixelBlack[1];
02734                 imageData4[r*width+c][2] = pixelBlack[2];
02735                 imageData4[r*width+c][3] = pixelBlack[3];
02736             }
02737         }
02738     }
02739 
02740     input->pix = pixCreate(input->width, input->height, 1);
02741 }
02742 
02743 typedef struct _TessDeviceScore {
02744     float time; // small time means faster device
02745     bool clError; // were there any opencl errors
02746     bool valid; // was the correct response generated
02747 } TessDeviceScore;
02748 
02749 /******************************************************************************
02750  * Micro Benchmarks for Device Selection
02751  *****************************************************************************/
02752 
02753 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02754 
02755     double time = 0;
02756 #if ON_WINDOWS
02757     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02758     QueryPerformanceFrequency(&freq);
02759 #elif ON_APPLE
02760         mach_timebase_info_data_t info = { 0, 0 };
02761     mach_timebase_info(&info);
02762         long long start,stop;
02763 #else
02764     timespec time_funct_start, time_funct_end;
02765 #endif
02766     // input data
02767     l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
02768 
02769     // function call
02770     if (type == DS_DEVICE_OPENCL_DEVICE) {
02771 #if ON_WINDOWS
02772         QueryPerformanceCounter(&time_funct_start);
02773 #elif  ON_APPLE
02774         start = mach_absolute_time();
02775 #else
02776         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02777 #endif
02778 
02779         OpenclDevice::gpuEnv = *env;
02780         int wpl = pixGetWpl(input.pix);
02781         OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
02782 #if ON_WINDOWS
02783         QueryPerformanceCounter(&time_funct_end);
02784         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02785 #elif  ON_APPLE
02786                 stop = mach_absolute_time();
02787                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
02788 #else
02789         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02790         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02791 #endif
02792 
02793     } else {
02794 #if ON_WINDOWS
02795         QueryPerformanceCounter(&time_funct_start);
02796 #elif  ON_APPLE
02797                 start = mach_absolute_time();
02798 #else
02799         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02800 #endif
02801         Pix *pix = pixCreate(input.width, input.height, 32);
02802         l_uint32 *pixData = pixGetData(pix);
02803         int wpl = pixGetWpl(pix);
02804         //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
02805         //pixSetData(pix, output_gpu);
02806         int i, j;
02807         int idx = 0;
02808         for (i = 0; i < input.height ; i++) {
02809             for (j = 0; j < input.width; j++) {
02810 
02811                 l_uint32 tiffword = tiffdata[i * input.width + j];
02812                 l_int32 rval = ((tiffword) & 0xff);
02813                 l_int32 gval = (((tiffword) >> 8) & 0xff);
02814                 l_int32 bval = (((tiffword) >> 16) & 0xff);
02815                 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
02816                 pixData[idx] = value;
02817                 idx++;
02818             }
02819         }
02820 #if ON_WINDOWS
02821         QueryPerformanceCounter(&time_funct_end);
02822         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02823 #elif  ON_APPLE
02824                 stop = mach_absolute_time();
02825                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
02826 #else
02827         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02828         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02829 #endif
02830         pixDestroy(&pix);
02831     }
02832 
02833 
02834     // cleanup
02835 
02836     return time;
02837 }
02838 
02839 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02840 
02841     double time;
02842 #if ON_WINDOWS
02843     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02844     QueryPerformanceFrequency(&freq);
02845 #elif  ON_APPLE
02846         mach_timebase_info_data_t info = { 0, 0 };
02847     mach_timebase_info(&info);
02848         long long start,stop;
02849 #else
02850     timespec time_funct_start, time_funct_end;
02851 #endif
02852 
02853     unsigned char pixelHi = (unsigned char)255;
02854 
02855     int left = 0;
02856     int top = 0;
02857     int kHistogramSize = 256;
02858     int bytes_per_line = input.width*input.numChannels;
02859     int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
02860     int retVal= 0;
02861     // function call
02862     if (type == DS_DEVICE_OPENCL_DEVICE) {
02863 #if ON_WINDOWS
02864         QueryPerformanceCounter(&time_funct_start);
02865 #elif  ON_APPLE
02866                 start = mach_absolute_time();
02867 #else
02868         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02869 #endif
02870 
02871         OpenclDevice::gpuEnv = *env;
02872         int wpl = pixGetWpl(input.pix);
02873         retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
02874 
02875 #if ON_WINDOWS
02876         QueryPerformanceCounter(&time_funct_end);
02877         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02878 #elif  ON_APPLE
02879                 stop = mach_absolute_time();
02880                 if(retVal ==0)
02881                 {
02882                         time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
02883                 }
02884                 else
02885                 {
02886                         time= FLT_MAX;
02887                 }
02888 #else
02889         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02890         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02891 #endif
02892     } else {
02893 
02894         int *histogram = new int[kHistogramSize];
02895 #if ON_WINDOWS
02896         QueryPerformanceCounter(&time_funct_start);
02897 #elif  ON_APPLE
02898                 start = mach_absolute_time();
02899 #else
02900         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02901 #endif
02902         for (int ch = 0; ch < input.numChannels; ++ch) {
02903             tesseract::HistogramRect(input.pix, input.numChannels,
02904                   left, top, input.width, input.height, histogram);
02905         }
02906 #if ON_WINDOWS
02907         QueryPerformanceCounter(&time_funct_end);
02908         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02909 #elif  ON_APPLE
02910                 stop = mach_absolute_time();
02911                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
02912 #else
02913         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02914         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02915 #endif
02916         delete[] histogram;
02917     }
02918 
02919     // cleanup
02920     delete[] histogramAllChannels;
02921     return time;
02922 }
02923 
02924 //Reproducing the ThresholdRectToPix native version
02925 void ThresholdRectToPix_Native(const unsigned char* imagedata,
02926                                           int bytes_per_pixel,
02927                                           int bytes_per_line,
02928                                           const int* thresholds,
02929                                           const int* hi_values,
02930                                           Pix** pix) {
02931     int top = 0;
02932     int left = 0;
02933     int width = pixGetWidth(*pix);
02934     int height = pixGetHeight(*pix);
02935 
02936   *pix = pixCreate(width, height, 1);
02937   uinT32* pixdata = pixGetData(*pix);
02938   int wpl = pixGetWpl(*pix);
02939   const unsigned char* srcdata = imagedata + top * bytes_per_line +
02940                                  left * bytes_per_pixel;
02941   for (int y = 0; y < height; ++y) {
02942     const uinT8* linedata = srcdata;
02943     uinT32* pixline = pixdata + y * wpl;
02944     for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
02945       bool white_result = true;
02946       for (int ch = 0; ch < bytes_per_pixel; ++ch) {
02947         if (hi_values[ch] >= 0 &&
02948             (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
02949           white_result = false;
02950           break;
02951         }
02952       }
02953       if (white_result)
02954         CLEAR_DATA_BIT(pixline, x);
02955       else
02956         SET_DATA_BIT(pixline, x);
02957     }
02958     srcdata += bytes_per_line;
02959   }
02960 }
02961 
02962 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02963 
02964     double time;
02965     int retVal =0;
02966 #if ON_WINDOWS
02967     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02968     QueryPerformanceFrequency(&freq);
02969 #elif  ON_APPLE
02970         mach_timebase_info_data_t info = { 0, 0 };
02971     mach_timebase_info(&info);
02972         long long start,stop;
02973 #else
02974     timespec time_funct_start, time_funct_end;
02975 #endif
02976 
02977     // input data
02978     unsigned char pixelHi = (unsigned char)255;
02979     int* thresholds = new int[4];
02980     thresholds[0] = pixelHi/2;
02981     thresholds[1] = pixelHi/2;
02982     thresholds[2] = pixelHi/2;
02983     thresholds[3] = pixelHi/2;
02984     int *hi_values = new int[4];
02985     thresholds[0] = pixelHi;
02986     thresholds[1] = pixelHi;
02987     thresholds[2] = pixelHi;
02988     thresholds[3] = pixelHi;
02989     //Pix* pix = pixCreate(width, height, 1);
02990     int top = 0;
02991     int left = 0;
02992     int bytes_per_line = input.width*input.numChannels;
02993 
02994     // function call
02995     if (type == DS_DEVICE_OPENCL_DEVICE) {
02996 #if ON_WINDOWS
02997         QueryPerformanceCounter(&time_funct_start);
02998 #elif  ON_APPLE
02999                 start = mach_absolute_time();
03000 #else
03001         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
03002 #endif
03003 
03004         OpenclDevice::gpuEnv = *env;
03005         int wpl = pixGetWpl(input.pix);
03006         retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
03007 
03008 #if ON_WINDOWS
03009         QueryPerformanceCounter(&time_funct_end);
03010         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
03011 #elif  ON_APPLE
03012                 stop = mach_absolute_time();
03013                 if(retVal ==0)
03014                 {
03015                         time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;;
03016                 }
03017                 else
03018                 {
03019                         time= FLT_MAX;
03020                 }
03021 
03022 #else
03023         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
03024         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
03025 #endif
03026     } else {
03027 
03028 
03029         tesseract::ImageThresholder thresholder;
03030         thresholder.SetImage( input.pix );
03031 #if ON_WINDOWS
03032         QueryPerformanceCounter(&time_funct_start);
03033 #elif  ON_APPLE
03034                 start = mach_absolute_time();
03035 #else
03036         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
03037 #endif
03038         ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
03039             thresholds, hi_values, &input.pix );
03040 
03041 #if ON_WINDOWS
03042         QueryPerformanceCounter(&time_funct_end);
03043         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
03044 #elif  ON_APPLE
03045                 stop = mach_absolute_time();
03046                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
03047 #else
03048         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
03049         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
03050 #endif
03051     }
03052 
03053     // cleanup
03054     delete[] thresholds;
03055     delete[] hi_values;
03056     return time;
03057 }
03058 
03059 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
03060 
03061     double time = 0;
03062 #if ON_WINDOWS
03063     LARGE_INTEGER freq, time_funct_start, time_funct_end;
03064     QueryPerformanceFrequency(&freq);
03065 #elif  ON_APPLE
03066         mach_timebase_info_data_t info = { 0, 0 };
03067     mach_timebase_info(&info);
03068         long long start,stop;
03069 #else
03070     timespec time_funct_start, time_funct_end;
03071 #endif
03072 
03073     // input data
03074     int resolution = 300;
03075     int wpl = pixGetWpl(input.pix);
03076     int kThinLineFraction = 20; // tess constant
03077     int kMinLineLengthFraction = 4; // tess constant
03078     int max_line_width = resolution / kThinLineFraction;
03079     int min_line_length = resolution / kMinLineLengthFraction;
03080     int closing_brick = max_line_width / 3;
03081 
03082     // function call
03083     if (type == DS_DEVICE_OPENCL_DEVICE) {
03084 #if ON_WINDOWS
03085         QueryPerformanceCounter(&time_funct_start);
03086 #elif  ON_APPLE
03087                 start = mach_absolute_time();
03088 #else
03089         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
03090 #endif
03091         Pix *src_pix = input.pix;
03092         OpenclDevice::gpuEnv = *env;
03093         OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
03094         Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
03095         OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
03096 
03097         OpenclDevice::releaseMorphCLBuffers();
03098 
03099 #if ON_WINDOWS
03100         QueryPerformanceCounter(&time_funct_end);
03101         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
03102 #elif  ON_APPLE
03103                 stop = mach_absolute_time();
03104                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
03105 #else
03106         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
03107         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
03108 #endif
03109     } else {
03110 #if ON_WINDOWS
03111         QueryPerformanceCounter(&time_funct_start);
03112 #elif  ON_APPLE
03113                 start = mach_absolute_time();
03114 #else
03115         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
03116 #endif
03117 
03118         // native serial code
03119         Pix *src_pix = input.pix;
03120         Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
03121         Pix *pix_solid  = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
03122         Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
03123         pixDestroy(&pix_solid);
03124         Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
03125         Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
03126         pixDestroy(&pix_hollow);
03127 
03128 #if ON_WINDOWS
03129         QueryPerformanceCounter(&time_funct_end);
03130         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
03131 #elif  ON_APPLE
03132                 stop = mach_absolute_time();
03133                 time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
03134 #else
03135         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
03136         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
03137 #endif
03138     }
03139 
03140     return time;
03141 }
03142 
03143 
03144 
03145 /******************************************************************************
03146  * Device Selection
03147  *****************************************************************************/
03148 
03149 #include "stdlib.h"
03150 
03151 
03152 // encode score object as byte string
03153 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
03154     *serializedScoreSize = sizeof(TessDeviceScore);
03155     *serializedScore = (void *) new unsigned char[*serializedScoreSize];
03156     memcpy(*serializedScore, device->score, *serializedScoreSize);
03157     return DS_SUCCESS;
03158 }
03159 
03160 // parses byte string and stores in score object
03161 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
03162     // check that serializedScoreSize == sizeof(TessDeviceScore);
03163     device->score = new TessDeviceScore;
03164     memcpy(device->score, serializedScore, serializedScoreSize);
03165     return DS_SUCCESS;
03166 }
03167 
03168 ds_status releaseScore( void* score ) {
03169   delete[] score;
03170   return DS_SUCCESS;
03171 }
03172 
03173 // evaluate devices
03174 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
03175 
03176     // overwrite statuc gpuEnv w/ current device
03177     // so native opencl calls can be used; they use static gpuEnv
03178     printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
03179     GPUEnv *env = NULL;
03180     if (device->type == DS_DEVICE_OPENCL_DEVICE) {
03181         env = new GPUEnv;
03182         //printf("[DS] populating tmp GPUEnv from device\n");
03183         populateGPUEnvFromDevice( env, device->oclDeviceID);
03184         env->mnFileCount = 0; //argc;
03185         env->mnKernelCount = 0UL;
03186         //printf("[DS] compiling kernels for tmp GPUEnv\n");
03187         OpenclDevice::gpuEnv = *env;
03188         OpenclDevice::CompileKernelFile(env, "");
03189     }
03190 
03191     TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
03192 
03193     // pixReadTiff
03194     double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
03195 
03196     // HistogramRect
03197     double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
03198 
03199     // ThresholdRectToPix
03200     double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
03201 
03202     // getLineMasks
03203     double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
03204 
03205 
03206     // weigh times (% of cpu time)
03207     // these weights should be the % execution time that the native cpu code took
03208     float composeRGBPixelWeight     = 1.2f;
03209     float histogramRectWeight       = 2.4f;
03210     float thresholdRectToPixWeight  = 4.5f;
03211     float getLineMasksMorphWeight   = 5.0f;
03212 
03213     float weightedTime =
03214         composeRGBPixelWeight       * composeRGBPixelTime +
03215         histogramRectWeight         * histogramRectTime +
03216         thresholdRectToPixWeight    * thresholdRectToPixTime +
03217         getLineMasksMorphWeight     * getLineMasksMorphTime
03218         ;
03219     device->score = (void *)new TessDeviceScore;
03220     ((TessDeviceScore *)device->score)->time = weightedTime;
03221 
03222     printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
03223     printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
03224     printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
03225     printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
03226     printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
03227     printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
03228     return DS_SUCCESS;
03229 }
03230 
03231 // initial call to select device
03232 ds_device OpenclDevice::getDeviceSelection( ) {
03233   if (!deviceIsSelected) {
03234 PERF_COUNT_START("getDeviceSelection")
03235   // check if opencl is available at runtime
03236   if( 1 == LoadOpencl() ) {
03237     // opencl is available
03238 //PERF_COUNT_SUB("LoadOpencl")
03239     // setup devices
03240     ds_status status;
03241     ds_profile *profile;
03242     status = initDSProfile( &profile, "v0.1" );
03243 PERF_COUNT_SUB("initDSProfile")
03244     // try reading scores from file
03245     char *fileName = "tesseract_opencl_profile_devices.dat";
03246     status = readProfileFromFile( profile, deserializeScore, fileName);
03247     if (status != DS_SUCCESS) {
03248       // need to run evaluation
03249       printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
03250 
03251       // create input data
03252       TessScoreEvaluationInputData input;
03253       populateTessScoreEvaluationInputData( &input );
03254 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
03255       // perform evaluations
03256       unsigned int numUpdates;
03257       status =  profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
03258 PERF_COUNT_SUB("profileDevices")
03259       // write scores to file
03260       if ( status == DS_SUCCESS ) {
03261         status = writeProfileToFile( profile, serializeScore, fileName);
03262 PERF_COUNT_SUB("writeProfileToFile")
03263         if ( status == DS_SUCCESS ) {
03264           printf("[DS] Scores written to file (%s).\n", fileName);
03265         } else {
03266           printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
03267         }
03268       } else {
03269         printf("[DS] Unable to evaluate performance; scores not written to file.\n");
03270       }
03271     } else {
03272 
03273 PERF_COUNT_SUB("readProfileFromFile")
03274                 printf("[DS] Profile read from file (%s).\n", fileName);
03275     }
03276 
03277     // we now have device scores either from file or evaluation
03278     // select fastest using custom Tesseract selection algorithm
03279     float bestTime = FLT_MAX; // begin search with worst possible time
03280     int bestDeviceIdx = -1;
03281     for (int d = 0; d < profile->numDevices; d++) {
03282       ds_device device = profile->devices[d];
03283       TessDeviceScore score = *(TessDeviceScore *)device.score;
03284 
03285       float time = score.time;
03286               printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
03287       if (time < bestTime) {
03288                   bestTime = time;
03289           bestDeviceIdx = d;
03290       }
03291     }
03292     printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
03293     // cleanup
03294     // TODO: call destructor for profile object?
03295 
03296       bool overrided = false;
03297       char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
03298       if (overrideDeviceStr != NULL) {
03299         int overrideDeviceIdx = atoi(overrideDeviceStr);
03300         if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
03301           printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
03302           bestDeviceIdx = overrideDeviceIdx - 1;
03303           overrided = true;
03304         } else {
03305           printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
03306         }
03307       }
03308 
03309       if (overrided) {
03310         printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
03311       }
03312       selectedDevice = profile->devices[bestDeviceIdx];
03313       // cleanup
03314       releaseDSProfile(profile, releaseScore);
03315     } else {
03316       // opencl isn't available at runtime, select native cpu device
03317       printf("[DS] OpenCL runtime not available.\n");
03318       selectedDevice.type = DS_DEVICE_NATIVE_CPU;
03319       selectedDevice.oclDeviceName = "(null)";
03320       selectedDevice.score = NULL;
03321       selectedDevice.oclDeviceID = NULL;
03322       selectedDevice.oclDriverVersion = NULL;
03323     }
03324     deviceIsSelected = true;
03325 PERF_COUNT_SUB("select from Profile")
03326 PERF_COUNT_END
03327   }
03328 //PERF_COUNT_END
03329   return selectedDevice;
03330 }
03331 
03332 
03333 bool OpenclDevice::selectedDeviceIsOpenCL() {
03334   ds_device device = getDeviceSelection();
03335   return (device.type == DS_DEVICE_OPENCL_DEVICE);
03336 }
03337 
03338 bool OpenclDevice::selectedDeviceIsNativeCPU() {
03339   ds_device device = getDeviceSelection();
03340   return (device.type == DS_DEVICE_NATIVE_CPU);
03341 }
03342 
03343 
03344 
03356 #define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))
03357 
03358 Pix * OpenclDevice::pixConvertRGBToGrayOCL(
03359         Pix *srcPix, // 32-bit source
03360         float rwt,
03361         float gwt,
03362         float bwt )
03363 {
03364 PERF_COUNT_START("pixConvertRGBToGrayOCL")
03365         Pix *dstPix; // 8-bit destination
03366 
03367         if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL;
03368 
03369         if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) {
03370                 // magic numbers from leptonica
03371             rwt = 0.3;
03372             gwt = 0.5;
03373             bwt = 0.2;
03374         }
03375         // normalize
03376         float sum = rwt + gwt + bwt;
03377         rwt /= sum;
03378         gwt /= sum;
03379         bwt /= sum;
03380 
03381         // source pix
03382         int w, h;
03383         pixGetDimensions(srcPix, &w, &h, NULL);
03384     //printf("Image is %i x %i\n", w, h);
03385     unsigned int *srcData = pixGetData(srcPix);
03386     int srcWPL = pixGetWpl(srcPix);
03387         int srcSize = srcWPL * h * sizeof(unsigned int);
03388 
03389         // destination pix
03390     if ((dstPix = pixCreate(w, h, 8)) == NULL)
03391         return NULL;
03392     pixCopyResolution(dstPix, srcPix);
03393     unsigned int *dstData = pixGetData(dstPix);
03394     int dstWPL = pixGetWpl(dstPix);
03395         int dstWords = dstWPL * h;
03396         int dstSize = dstWords * sizeof(unsigned int);
03397     //printf("dstSize = %i\n", dstSize);
03398 PERF_COUNT_SUB("pix setup")
03399 
03400         // opencl objects
03401         cl_int clStatus;
03402     KernelEnv kEnv;
03403     SetKernelEnv( &kEnv );
03404 
03405         // source buffer
03406         cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus );
03407     CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer");
03408 
03409     // destination buffer
03410     cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus );
03411     CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer");
03412 
03413     // setup work group size parameters
03414     int block_size = 256;
03415     int numWorkGroups = ((h*w+block_size-1) / block_size );
03416     int numThreads = block_size*numWorkGroups;
03417     size_t local_work_size[] = {static_cast<size_t>(block_size)};
03418     size_t global_work_size[] = {static_cast<size_t>(numThreads)};
03419     //printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h);
03420 
03421     /* compile kernel */
03422     kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus );
03423     CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray");
03424 
03425 
03426     /* set kernel arguments */
03427     clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer );
03428     CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer");
03429         clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer );
03430     CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer");
03431     clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL );
03432     CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" );
03433     clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL );
03434     CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" );
03435     clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h );
03436     CHECK_OPENCL( clStatus, "clSetKernelArg height" );
03437     clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w );
03438     CHECK_OPENCL( clStatus, "clSetKernelArg width" );
03439     clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt );
03440     CHECK_OPENCL( clStatus, "clSetKernelArg rwt" );
03441     clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt );
03442     CHECK_OPENCL( clStatus, "clSetKernelArg gwt");
03443     clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt );
03444     CHECK_OPENCL( clStatus, "clSetKernelArg bwt");
03445 
03446     /* launch kernel & wait */
03447 PERF_COUNT_SUB("before")
03448     clStatus = clEnqueueNDRangeKernel(
03449         kEnv.mpkCmdQueue,
03450         kEnv.mpkKernel,
03451         1, NULL, global_work_size, local_work_size,
03452         0, NULL, NULL );
03453     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" );
03454     clFinish( kEnv.mpkCmdQueue );
03455 PERF_COUNT_SUB("kernel")
03456 
03457     /* map results back from gpu */
03458     void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus);
03459     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer");
03460     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL);
03461 
03462 #if 0
03463     // validate: compute on cpu
03464     Pix *cpuPix = pixCreate(w, h, 8);
03465     pixCopyResolution(cpuPix, srcPix);
03466     unsigned int *cpuData = pixGetData(cpuPix);
03467     int cpuWPL = pixGetWpl(cpuPix);
03468     unsigned int *cpuLine, *srcLine;
03469     int i, j;
03470     for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) {
03471         for (j = 0; j < w; j++) {
03472             unsigned int word = *(srcLine + j);
03473             int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) +
03474                             gwt * ((word >> L_GREEN_SHIFT) & 0xff) +
03475                             bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5);
03476             SET_DATA_BYTE(cpuLine, j, val);
03477         }
03478         srcLine += srcWPL;
03479         cpuLine += cpuWPL;
03480     }
03481 
03482     // validate: compare
03483     printf("converted 32-bit -> 8-bit image\n");
03484     for (int row = 0; row < h; row++) {
03485         for (int col = 0; col < w; col++) {
03486             int idx = row*w + col;
03487             unsigned int srcVal = srcData[idx];
03488             unsigned char cpuVal = ((unsigned char *)cpuData)[idx];
03489             unsigned char oclVal = ((unsigned char *)dstData)[idx];
03490             if (srcVal > 0) {
03491                 printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal);
03492             }
03493         }
03494         //printf("\n");
03495     }
03496 #endif
03497     // release opencl objects
03498     clReleaseMemObject(srcBuffer);
03499     clReleaseMemObject(dstBuffer);
03500 
03501 
03502 PERF_COUNT_END
03503         // success
03504         return dstPix;
03505 }
03506 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines