tesseract  3.04.01
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
openclwrapper.cpp
Go to the documentation of this file.
1 #ifdef _WIN32
2 #include <windows.h>
3 #include <io.h>
4 
5 #else
6 #include <sys/types.h>
7 #include <unistd.h>
8 #endif
9 #include <float.h>
10 
11 #include "openclwrapper.h"
12 #include "oclkernels.h"
13 
14 // for micro-benchmark
15 #include "otsuthr.h"
16 #include "thresholder.h"
17 
18 #if ON_APPLE
19 #include <stdio.h>
20 #include <mach/mach_time.h>
21 #endif
22 
23 /*
24  Convenience macro to test the version of Leptonica.
25 */
26 #if defined(LIBLEPT_MAJOR_VERSION) && defined(LIBLEPT_MINOR_VERSION)
27 # define TESSERACT_LIBLEPT_PREREQ(maj, min) \
28  ((LIBLEPT_MAJOR_VERSION) > (maj) || ((LIBLEPT_MAJOR_VERSION) == (maj) && (LIBLEPT_MINOR_VERSION) >= (min)))
29 #else
30 # define TESSERACT_LIBLEPT_PREREQ(maj, min) 0
31 #endif
32 
33 #if TESSERACT_LIBLEPT_PREREQ(1,73)
34 # define CALLOC LEPT_CALLOC
35 # define FREE LEPT_FREE
36 #endif
37 
38 #ifdef USE_OPENCL
39 
41 GPUEnv OpenclDevice::gpuEnv;
42 
43 
44 bool OpenclDevice::deviceIsSelected = false;
45 ds_device OpenclDevice::selectedDevice;
46 
47 
48 int OpenclDevice::isInited = 0;
49 
50 struct tiff_transform {
51  int vflip; /* if non-zero, image needs a vertical fip */
52  int hflip; /* if non-zero, image needs a horizontal flip */
53  int rotate; /* -1 -> counterclockwise 90-degree rotation,
54  0 -> no rotation
55  1 -> clockwise 90-degree rotation */
56 };
57 
58 static struct tiff_transform tiff_orientation_transforms[] = {
59  {0, 0, 0},
60  {0, 1, 0},
61  {1, 1, 0},
62  {1, 0, 0},
63  {0, 1, -1},
64  {0, 0, 1},
65  {0, 1, 1},
66  {0, 0, -1}
67 };
68 
69 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000;
70 
71 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
72 cl_mem pixThBuffer; //output from thresholdtopix calculation
73 cl_int clStatus;
74 KernelEnv rEnv;
75 
76 // substitute invalid characters in device name with _
77 void legalizeFileName( char *fileName) {
78  //printf("fileName: %s\n", fileName);
79  const char* invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches
80  // for each invalid char
81  for (int i = 0; i < strlen(invalidChars); i++) {
82  char invalidStr[4];
83  invalidStr[0] = invalidChars[i];
84  invalidStr[1] = NULL;
85  //printf("eliminating %s\n", invalidStr);
86  //char *pos = strstr(fileName, invalidStr);
87  // initial ./ is valid for present directory
88  //if (*pos == '.') pos++;
89  //if (*pos == '/') pos++;
90  for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) {
91  //printf("\tfound: %s, ", pos);
92  pos[0] = '_';
93  //printf("fileName: %s\n", fileName);
94  }
95  }
96 }
97 
98 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
99  //printf("[DS] populateGPUEnvFromDevice\n");
100  size_t size;
101  gpuInfo->mnIsUserCreated = 1;
102  // device
103  gpuInfo->mpDevID = device;
104  gpuInfo->mpArryDevsID = new cl_device_id[1];
105  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
106  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) &gpuInfo->mDevType , &size);
107  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
108  // platform
109  clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size);
110  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
111  // context
112  cl_context_properties props[3];
113  props[0] = CL_CONTEXT_PLATFORM;
114  props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
115  props[2] = 0;
116  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
117  CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
118  // queue
119  cl_command_queue_properties queueProperties = 0;
120  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
121  CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
122 
123 }
124 
125 int OpenclDevice::LoadOpencl()
126 {
127 #ifdef WIN32
128  HINSTANCE HOpenclDll = NULL;
129  void * OpenclDll = NULL;
130  //fprintf(stderr, " LoadOpenclDllxx... \n");
131  OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
132  OpenclDll = LoadLibrary( "openCL.dll" );
133  if ( !static_cast<HINSTANCE>( OpenclDll ) )
134  {
135  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
136  FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
137  return 0;
138 
139  }
140  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
141 #endif
142  return 1;
143 }
144 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
145 {
146  envInfo->mpkContext = gpuEnv.mpContext;
147  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
148  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
149 
150  return 1;
151 }
152 
153 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
154 {
155  cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
156  nElements * sizeof(l_uint32), hostbuffer, pStatus);
157 
158  return membuffer;
159 }
160 
161 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
162 {
163  PROCNAME("mapOutputCLBuffer");
164  if (!pixd)
165  {
166  if (memcopy)
167  {
168  if ((pixd = pixCreateTemplate(pixs)) == NULL)
169  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
170  }
171  else
172  {
173  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
174  (PIX *)ERROR_PTR("pixd not made", procName, NULL);
175  }
176  }
177  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
178  elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
179 
180  if (memcopy)
181  {
182  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
183  }
184  else
185  {
186  pixSetData(pixd, pValues);
187  }
188 
189  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
190 
191  if (sync)
192  {
193  clFinish( rEnv.mpkCmdQueue );
194  }
195 
196  return pixd;
197 }
198 
199  cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
200 {
201  cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
202  nElements * sizeof(l_int32), NULL, pStatus);
203 
204  if (_pValues != NULL)
205  {
206  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
207  nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
208 
209  memcpy(pValues, _pValues, nElements * sizeof(l_int32));
210 
211  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
212 
213  if (sync)
214  clFinish( rEnv.mpkCmdQueue );
215  }
216 
217  return xValues;
218 }
219 
220 
221 void OpenclDevice::releaseMorphCLBuffers()
222 {
223  if (pixdCLIntermediate != NULL)
224  clReleaseMemObject(pixdCLIntermediate);
225  if (pixsCLBuffer != NULL)
226  clReleaseMemObject(pixsCLBuffer);
227  if (pixdCLBuffer != NULL)
228  clReleaseMemObject(pixdCLBuffer);
229  if (pixThBuffer != NULL)
230  clReleaseMemObject(pixThBuffer);
231 }
232 
233 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
234 {
235  SetKernelEnv( &rEnv );
236 
237  if (pixThBuffer != NULL)
238  {
239  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
240 
241  //Get the output from ThresholdToPix operation
242  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
243  }
244  else
245  {
246  //Get data from the source image
247  l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
248  memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
249 
250  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
251  }
252 
253  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
254 
255  pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
256 
257  return (int)clStatus;
258 }
259 
260 int OpenclDevice::InitEnv()
261 {
262 //PERF_COUNT_START("OD::InitEnv")
263 // printf("[OD] OpenclDevice::InitEnv()\n");
264 #ifdef SAL_WIN32
265  while( 1 )
266  {
267  if( 1 == LoadOpencl() )
268  break;
269  }
270 PERF_COUNT_SUB("LoadOpencl")
271 #endif
272  // sets up environment, compiles programs
273 
274 
275  InitOpenclRunEnv_DeviceSelection( 0 );
276 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
277 //PERF_COUNT_END
278  return 1;
279 }
280 
281 int OpenclDevice::ReleaseOpenclRunEnv()
282 {
283  ReleaseOpenclEnv( &gpuEnv );
284 #ifdef SAL_WIN32
285  FreeOpenclDll();
286 #endif
287  return 1;
288 }
289 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
290 {
291  if ( kCount < 1 )
292  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
293  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
294  gpuEnv.mnKernelCount++;
295  return 0;
296 }
297 int OpenclDevice::RegistOpenclKernel()
298 {
299  if ( !gpuEnv.mnIsUserCreated )
300  memset( &gpuEnv, 0, sizeof(gpuEnv) );
301 
302  gpuEnv.mnFileCount = 0; //argc;
303  gpuEnv.mnKernelCount = 0UL;
304 
305  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
306  return 0;
307 }
308 
309 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
310 //PERF_COUNT_START("InitOpenclRunEnv_DS")
311  if (!isInited) {
312  // after programs compiled, selects best device
313  //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
314  ds_device bestDevice_DS = getDeviceSelection( );
315 //PERF_COUNT_SUB("called getDeviceSelection()")
316  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
317  // overwrite global static GPUEnv with new device
318  if (selectedDeviceIsOpenCL() ) {
319  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
320  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
321  gpuEnv.mnFileCount = 0; //argc;
322  gpuEnv.mnKernelCount = 0UL;
323 //PERF_COUNT_SUB("populate gpuEnv")
324  CompileKernelFile(&gpuEnv, "");
325 //PERF_COUNT_SUB("CompileKernelFile")
326  } else {
327  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
328  }
329  isInited = 1;
330  }
331 //PERF_COUNT_END
332  return 0;
333 }
334 
335 
336 OpenclDevice::OpenclDevice()
337 {
338  //InitEnv();
339 }
340 
341 OpenclDevice::~OpenclDevice()
342 {
343  //ReleaseOpenclRunEnv();
344 }
345 
346 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
347 {
348  int i = 0;
349  int clStatus = 0;
350 
351  if ( !isInited )
352  {
353  return 1;
354  }
355 
356  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
357  {
358  if ( gpuEnv.mpArryPrograms[i] )
359  {
360  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
361  CHECK_OPENCL( clStatus, "clReleaseProgram" );
362  gpuEnv.mpArryPrograms[i] = NULL;
363  }
364  }
365  if ( gpuEnv.mpCmdQueue )
366  {
367  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
368  gpuEnv.mpCmdQueue = NULL;
369  }
370  if ( gpuEnv.mpContext )
371  {
372  clReleaseContext( gpuEnv.mpContext );
373  gpuEnv.mpContext = NULL;
374  }
375  isInited = 0;
376  gpuInfo->mnIsUserCreated = 0;
377  free( gpuInfo->mpArryDevsID );
378  return 1;
379 }
380 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
381 {
382  unsigned int i = 0;
383  cl_int clStatus;
384  int status = 0;
385  char *str = NULL;
386  FILE *fd = NULL;
387  char fileName[256] = { 0 }, cl_name[128] = { 0 };
388  char deviceName[1024];
389  clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
390  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
391  str = (char*) strstr( clFileName, (char*) ".cl" );
392  memcpy( cl_name, clFileName, str - clFileName );
393  cl_name[str - clFileName] = '\0';
394  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
395  legalizeFileName(fileName);
396  fd = fopen( fileName, "rb" );
397  status = ( fd != NULL ) ? 1 : 0;
398  if ( fd != NULL )
399  {
400  *fhandle = fd;
401  }
402  return status;
403 
404 }
405 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
406 {
407  int i;
408  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
409  {
410  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
411  {
412  if ( gpuEnvCached->mpArryPrograms[i] != NULL )
413  {
414  return 1;
415  }
416  }
417  }
418 
419  return 0;
420 }
421 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
422 {
423  FILE *output = NULL;
424  output = fopen( fileName, "wb" );
425  if ( output == NULL )
426  {
427  return 0;
428  }
429 
430  fwrite( birary, sizeof(char), numBytes, output );
431  fclose( output );
432 
433  return 1;
434 
435 }
436 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
437 {
438  unsigned int i = 0;
439  cl_int clStatus;
440  size_t *binarySizes, numDevices=0;
441  cl_device_id *mpArryDevsID;
442  char **binaries, *str = NULL;
443 
444  clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
445  sizeof(numDevices), &numDevices, NULL );
446  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
447 
448  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
449  if ( mpArryDevsID == NULL )
450  {
451  return 0;
452  }
453  /* grab the handles to all of the devices in the program. */
454  clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
455  sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
456  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
457 
458  /* figure out the sizes of each of the binaries. */
459  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
460 
461  clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
462  sizeof(size_t) * numDevices, binarySizes, NULL );
463  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
464 
465  /* copy over all of the generated binaries. */
466  binaries = (char**) malloc( sizeof(char *) * numDevices );
467  if ( binaries == NULL )
468  {
469  return 0;
470  }
471 
472  for ( i = 0; i < numDevices; i++ )
473  {
474  if ( binarySizes[i] != 0 )
475  {
476  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
477  if ( binaries[i] == NULL )
478  {
479  return 0;
480  }
481  }
482  else
483  {
484  binaries[i] = NULL;
485  }
486  }
487 
488  clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
489  sizeof(char *) * numDevices, binaries, NULL );
490  CHECK_OPENCL(clStatus,"clGetProgramInfo");
491 
492  /* dump out each binary into its own separate file. */
493  for ( i = 0; i < numDevices; i++ )
494  {
495  char fileName[256] = { 0 }, cl_name[128] = { 0 };
496 
497  if ( binarySizes[i] != 0 )
498  {
499  char deviceName[1024];
500  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
501  sizeof(deviceName), deviceName, NULL);
502  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
503 
504  str = (char*) strstr( clFileName, (char*) ".cl" );
505  memcpy( cl_name, clFileName, str - clFileName );
506  cl_name[str - clFileName] = '\0';
507  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
508  legalizeFileName(fileName);
509  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
510  {
511  printf("[OD] write binary[%s] failed\n", fileName);
512  return 0;
513  } //else
514  printf("[OD] write binary[%s] successfully\n", fileName);
515  }
516  }
517 
518  // Release all resouces and memory
519  for ( i = 0; i < numDevices; i++ )
520  {
521  if ( binaries[i] != NULL )
522  {
523  free( binaries[i] );
524  binaries[i] = NULL;
525  }
526  }
527 
528  if ( binaries != NULL )
529  {
530  free( binaries );
531  binaries = NULL;
532  }
533 
534  if ( binarySizes != NULL )
535  {
536  free( binarySizes );
537  binarySizes = NULL;
538  }
539 
540  if ( mpArryDevsID != NULL )
541  {
542  free( mpArryDevsID );
543  mpArryDevsID = NULL;
544  }
545  return 1;
546 }
547 
548 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
549 {
550  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
551  nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
552  clFinish( rEnv.mpkCmdQueue );
553  if (_pValues != NULL)
554  {
555  for ( int i = 0; i < (int)nElements; i++ )
556  pValues[i] = (l_int32)_pValues[i];
557  }
558 
559  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
560  //clFinish( rEnv.mpkCmdQueue );
561  return;
562 }
563 
564 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
565 {
566 //PERF_COUNT_START("CompileKernelFile")
567  cl_int clStatus = 0;
568  size_t length;
569  char *buildLog = NULL, *binary;
570  const char *source;
571  size_t source_size[1];
572  int b_error, binary_status, binaryExisted, idx;
573  size_t numDevices;
574  cl_device_id *mpArryDevsID;
575  FILE *fd, *fd1;
576  const char* filename = "kernel.cl";
577  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
578  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
579  {
580  return 1;
581  }
582 
583  idx = gpuInfo->mnFileCount;
584 
585  source = kernel_src;
586 
587  source_size[0] = strlen( source );
588  binaryExisted = 0;
589  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
590 //PERF_COUNT_SUB("BinaryGenerated")
591  if ( binaryExisted == 1 )
592  {
593  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
594  sizeof(numDevices), &numDevices, NULL );
595  CHECK_OPENCL( clStatus, "clGetContextInfo" );
596 
597  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
598  if ( mpArryDevsID == NULL )
599  {
600  return 0;
601  }
602 //PERF_COUNT_SUB("get numDevices")
603  b_error = 0;
604  length = 0;
605  b_error |= fseek( fd, 0, SEEK_END ) < 0;
606  b_error |= ( length = ftell(fd) ) <= 0;
607  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
608  if ( b_error )
609  {
610  return 0;
611  }
612 
613  binary = (char*) malloc( length + 2 );
614  if ( !binary )
615  {
616  return 0;
617  }
618 
619  memset( binary, 0, length + 2 );
620  b_error |= fread( binary, 1, length, fd ) != length;
621 
622 
623  fclose( fd );
624 //PERF_COUNT_SUB("read file")
625  fd = NULL;
626  // grab the handles to all of the devices in the context.
627  clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
628  sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
629  CHECK_OPENCL( clStatus, "clGetContextInfo" );
630 //PERF_COUNT_SUB("get devices")
631  //fprintf(stderr, "[OD] Create kernel from binary\n");
632  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
633  mpArryDevsID, &length, (const unsigned char**) &binary,
634  &binary_status, &clStatus );
635  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
636 //PERF_COUNT_SUB("clCreateProgramWithBinary")
637  free( binary );
638  free( mpArryDevsID );
639  mpArryDevsID = NULL;
640 //PERF_COUNT_SUB("binaryExisted")
641  }
642  else
643  {
644  // create a CL program using the kernel source
645  //fprintf(stderr, "[OD] Create kernel from source\n");
646  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
647  source_size, &clStatus);
648  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
649 //PERF_COUNT_SUB("!binaryExisted")
650  }
651 
652  if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
653  {
654  return 0;
655  }
656 
657  //char options[512];
658  // create a cl program executable for all the devices specified
659  //printf("[OD] BuildProgram.\n");
660 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
661  if (!gpuInfo->mnIsUserCreated)
662  {
663  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
664  buildOption, NULL, NULL);
665 //PERF_COUNT_SUB("clBuildProgram notUserCreated")
666  }
667  else
668  {
669  clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
670  buildOption, NULL, NULL);
671 //PERF_COUNT_SUB("clBuildProgram isUserCreated")
672  }
674  if ( clStatus != CL_SUCCESS )
675  {
676  printf ("BuildProgram error!\n");
677  if ( !gpuInfo->mnIsUserCreated )
678  {
679  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
680  CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
681  }
682  else
683  {
684  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
685  CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
686  }
687  if ( clStatus != CL_SUCCESS )
688  {
689  printf("opencl create build log fail\n");
690  return 0;
691  }
692  buildLog = (char*) malloc( length );
693  if ( buildLog == (char*) NULL )
694  {
695  return 0;
696  }
697  if ( !gpuInfo->mnIsUserCreated )
698  {
699  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
700  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
701  }
702  else
703  {
704  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
705  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
706  }
707  if ( clStatus != CL_SUCCESS )
708  {
709  printf("opencl program build info fail\n");
710  return 0;
711  }
712 
713  fd1 = fopen( "kernel-build.log", "w+" );
714  if ( fd1 != NULL )
715  {
716  fwrite( buildLog, sizeof(char), length, fd1 );
717  fclose( fd1 );
718  }
719 
720  free( buildLog );
721 //PERF_COUNT_SUB("build error log")
722  return 0;
723  }
724 
725  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
726 //PERF_COUNT_SUB("strcpy")
727  if ( binaryExisted == 0 ) {
728  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
729  PERF_COUNT_SUB("GenerateBinFromKernelSource")
730  }
731 
732  gpuInfo->mnFileCount += 1;
733 //PERF_COUNT_END
734  return 1;
735 }
736 
737 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
738 {
739 PERF_COUNT_START("pixReadFromTiffKernel")
740  cl_int clStatus;
741  KernelEnv rEnv;
742  size_t globalThreads[2];
743  size_t localThreads[2];
744  int gsize;
745  cl_mem valuesCl;
746  cl_mem outputCl;
747 
748  //global and local work dimensions for Horizontal pass
749  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
750  globalThreads[0] = gsize;
751  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
752  globalThreads[1] = gsize;
753  localThreads[0] = GROUPSIZE_X;
754  localThreads[1] = GROUPSIZE_Y;
755 
756  SetKernelEnv( &rEnv );
757 
758  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
759  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
760  CHECK_OPENCL( clStatus, "clCreateKernel");
761 
762  //Allocate input and output OCL buffers
763  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
764  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
765 
766  //Kernel arguments
767  clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
768  CHECK_OPENCL( clStatus, "clSetKernelArg");
769  clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
770  CHECK_OPENCL( clStatus, "clSetKernelArg" );
771  clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
772  CHECK_OPENCL( clStatus, "clSetKernelArg" );
773  clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
774  CHECK_OPENCL( clStatus, "clSetKernelArg" );
775  clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
776  CHECK_OPENCL( clStatus, "clSetKernelArg");
777 
778  //Kernel enqueue
779 PERF_COUNT_SUB("before")
780  clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
781  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
782 
783  /* map results back from gpu */
784  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
785  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
786  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
787 
788  //Sync
789  clFinish( rEnv.mpkCmdQueue );
790 PERF_COUNT_SUB("kernel & map")
792  return pResult;
793 }
794 
795 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
796 {
797 PERF_COUNT_START("pixReadTiffCL")
798  FILE *fp;
799 PIX *pix;
800 
801  //printf("pixReadTiffCl file");
802  PROCNAME("pixReadTiff");
803 
804  if (!filename)
805  return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
806 
807  if ((fp = fopenReadStream(filename)) == NULL)
808  return (PIX *)ERROR_PTR("image file not found", procName, NULL);
809  if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
810  fclose(fp);
811  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
812  }
813  fclose(fp);
815  return pix;
816 
817 }
818 TIFF *
819 OpenclDevice::fopenTiffCl(FILE *fp,
820  const char *modestring)
821 {
822 l_int32 fd;
823 
824  PROCNAME("fopenTiff");
825 
826  if (!fp)
827  return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
828  if (!modestring)
829  return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
830 
831  if ((fd = fileno(fp)) < 0)
832  return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
833  lseek(fd, 0, SEEK_SET);
834 
835  return TIFFFdOpen(fd, "TIFFstream", modestring);
836 }
837 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif,
838  l_int32 *pxres,
839  l_int32 *pyres)
840 {
841 l_uint16 resunit;
842 l_int32 foundxres, foundyres;
843 l_float32 fxres, fyres;
844 
845  PROCNAME("getTiffStreamResolution");
846 
847  if (!tif)
848  return ERROR_INT("tif not opened", procName, 1);
849  if (!pxres || !pyres)
850  return ERROR_INT("&xres and &yres not both defined", procName, 1);
851  *pxres = *pyres = 0;
852 
853  TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
854  foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
855  foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
856  if (!foundxres && !foundyres) return 1;
857  if (!foundxres && foundyres)
858  fxres = fyres;
859  else if (foundxres && !foundyres)
860  fyres = fxres;
861 
862  if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */
863  *pxres = (l_int32)(2.54 * fxres + 0.5);
864  *pyres = (l_int32)(2.54 * fyres + 0.5);
865  }
866  else {
867  *pxres = (l_int32)fxres;
868  *pyres = (l_int32)fyres;
869  }
870 
871  return 0;
872 }
873 
874 struct L_Memstream
875 {
876 l_uint8 *buffer; /* expands to hold data when written to; */
877  /* fixed size when read from. */
878 size_t bufsize; /* current size allocated when written to; */
879  /* fixed size of input data when read from. */
880 size_t offset; /* byte offset from beginning of buffer. */
881 size_t hw; /* high-water mark; max bytes in buffer. */
882 l_uint8 **poutdata; /* input param for writing; data goes here. */
883 size_t *poutsize; /* input param for writing; data size goes here. */
884 };
885 typedef struct L_Memstream L_MEMSTREAM;
886 
887 /* These are static functions for memory I/O */
888 static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize);
889 static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata,
890  size_t *poutsize);
891 static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length);
892 static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data,
893  tsize_t length);
894 static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence);
895 static l_int32 tiffCloseCallback(thandle_t handle);
896 static toff_t tiffSizeCallback(thandle_t handle);
897 static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length);
898 static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length);
899 
900 
901 static L_MEMSTREAM *
902 memstreamCreateForRead(l_uint8 *indata,
903 size_t insize)
904 {
905  L_MEMSTREAM *mstream;
906 
907  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
908  mstream->buffer = indata; /* handle to input data array */
909  mstream->bufsize = insize; /* amount of input data */
910  mstream->hw = insize; /* high-water mark fixed at input data size */
911  mstream->offset = 0; /* offset always starts at 0 */
912  return mstream;
913 }
914 
915 
916 static L_MEMSTREAM *
917 memstreamCreateForWrite(l_uint8 **poutdata,
918 size_t *poutsize)
919 {
920  L_MEMSTREAM *mstream;
921 
922  mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM));
923  mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1);
924  mstream->bufsize = 8 * 1024;
925  mstream->poutdata = poutdata; /* used only at end of write */
926  mstream->poutsize = poutsize; /* ditto */
927  mstream->hw = mstream->offset = 0;
928  return mstream;
929 }
930 
931 
932 static tsize_t
933 tiffReadCallback(thandle_t handle,
934 tdata_t data,
935 tsize_t length)
936 {
937  L_MEMSTREAM *mstream;
938  size_t amount;
939 
940  mstream = (L_MEMSTREAM *)handle;
941  amount = L_MIN((size_t)length, mstream->hw - mstream->offset);
942  memcpy(data, mstream->buffer + mstream->offset, amount);
943  mstream->offset += amount;
944  return amount;
945 }
946 
947 
948 static tsize_t
949 tiffWriteCallback(thandle_t handle,
950 tdata_t data,
951 tsize_t length)
952 {
953  L_MEMSTREAM *mstream;
954  size_t newsize;
955 
956  /* reallocNew() uses calloc to initialize the array.
957  * If malloc is used instead, for some of the encoding methods,
958  * not all the data in 'bufsize' bytes in the buffer will
959  * have been initialized by the end of the compression. */
960  mstream = (L_MEMSTREAM *)handle;
961  if (mstream->offset + length > mstream->bufsize) {
962  newsize = 2 * (mstream->offset + length);
963  mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer,
964  mstream->offset, newsize);
965  mstream->bufsize = newsize;
966  }
967 
968  memcpy(mstream->buffer + mstream->offset, data, length);
969  mstream->offset += length;
970  mstream->hw = L_MAX(mstream->offset, mstream->hw);
971  return length;
972 }
973 
974 
975 static toff_t
976 tiffSeekCallback(thandle_t handle,
977 toff_t offset,
978 l_int32 whence)
979 {
980  L_MEMSTREAM *mstream;
981 
982  PROCNAME("tiffSeekCallback");
983  mstream = (L_MEMSTREAM *)handle;
984  switch (whence) {
985  case SEEK_SET:
986  /* fprintf(stderr, "seek_set: offset = %d\n", offset); */
987  mstream->offset = offset;
988  break;
989  case SEEK_CUR:
990  /* fprintf(stderr, "seek_cur: offset = %d\n", offset); */
991  mstream->offset += offset;
992  break;
993  case SEEK_END:
994  /* fprintf(stderr, "seek end: hw = %d, offset = %d\n",
995  mstream->hw, offset); */
996  mstream->offset = mstream->hw - offset; /* offset >= 0 */
997  break;
998  default:
999  return (toff_t)ERROR_INT("bad whence value", procName,
1000  mstream->offset);
1001  }
1002 
1003  return mstream->offset;
1004 }
1005 
1006 
1007 static l_int32
1008 tiffCloseCallback(thandle_t handle)
1009 {
1010  L_MEMSTREAM *mstream;
1011 
1012  mstream = (L_MEMSTREAM *)handle;
1013  if (mstream->poutdata) { /* writing: save the output data */
1014  *mstream->poutdata = mstream->buffer;
1015  *mstream->poutsize = mstream->hw;
1016  }
1017  FREE(mstream); /* never free the buffer! */
1018  return 0;
1019 }
1020 
1021 
1022 static toff_t
1023 tiffSizeCallback(thandle_t handle)
1024 {
1025  L_MEMSTREAM *mstream;
1026 
1027  mstream = (L_MEMSTREAM *)handle;
1028  return mstream->hw;
1029 }
1030 
1031 
1032 static l_int32
1033 tiffMapCallback(thandle_t handle,
1034 tdata_t *data,
1035 toff_t *length)
1036 {
1037  L_MEMSTREAM *mstream;
1038 
1039  mstream = (L_MEMSTREAM *)handle;
1040  *data = mstream->buffer;
1041  *length = mstream->hw;
1042  return 0;
1043 }
1044 
1045 
1046 static void
1047 tiffUnmapCallback(thandle_t handle,
1048 tdata_t data,
1049 toff_t length)
1050 {
1051  return;
1052 }
1053 
1054 
1071 static TIFF *
1072 fopenTiffMemstream(const char *filename,
1073 const char *operation,
1074 l_uint8 **pdata,
1075 size_t *pdatasize)
1076 {
1077  L_MEMSTREAM *mstream;
1078 
1079  PROCNAME("fopenTiffMemstream");
1080 
1081  if (!filename)
1082  return (TIFF *)ERROR_PTR("filename not defined", procName, NULL);
1083  if (!operation)
1084  return (TIFF *)ERROR_PTR("operation not defined", procName, NULL);
1085  if (!pdata)
1086  return (TIFF *)ERROR_PTR("&data not defined", procName, NULL);
1087  if (!pdatasize)
1088  return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL);
1089  if (!strcmp(operation, "r") && !strcmp(operation, "w"))
1090  return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName, NULL);
1091 
1092  if (!strcmp(operation, "r"))
1093  mstream = memstreamCreateForRead(*pdata, *pdatasize);
1094  else
1095  mstream = memstreamCreateForWrite(pdata, pdatasize);
1096 
1097  return TIFFClientOpen(filename, operation, mstream,
1098  tiffReadCallback, tiffWriteCallback,
1099  tiffSeekCallback, tiffCloseCallback,
1100  tiffSizeCallback, tiffMapCallback,
1101  tiffUnmapCallback);
1102 }
1103 
1104 
1105 
1106 PIX *
1107 OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n)
1108 {
1109  l_int32 i, pagefound;
1110  PIX *pix;
1111  TIFF *tif;
1112  //L_MEMSTREAM *memStream;
1113  PROCNAME("pixReadMemTiffCl");
1114 
1115  if (!data)
1116  return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL);
1117 
1118  if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) == NULL)
1119  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1120 
1121  pagefound = FALSE;
1122  pix = NULL;
1123  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1124  if (i == n) {
1125  pagefound = TRUE;
1126  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1127  TIFFCleanup(tif);
1128  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1129  }
1130  break;
1131  }
1132  if (TIFFReadDirectory(tif) == 0)
1133  break;
1134  }
1135 
1136  if (pagefound == FALSE) {
1137  L_WARNING("tiff page %d not found", procName);
1138  TIFFCleanup(tif);
1139  return NULL;
1140  }
1141 
1142  TIFFCleanup(tif);
1143  return pix;
1144 }
1145 
1146 PIX *
1147 OpenclDevice::pixReadStreamTiffCl(FILE *fp,
1148  l_int32 n)
1149 {
1150 l_int32 i, pagefound;
1151 PIX *pix;
1152 TIFF *tif;
1153 
1154  PROCNAME("pixReadStreamTiff");
1155 
1156  if (!fp)
1157  return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
1158 
1159  if ((tif = fopenTiffCl(fp, "rb")) == NULL)
1160  return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
1161 
1162  pagefound = FALSE;
1163  pix = NULL;
1164  for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
1165  if (i == n) {
1166  pagefound = TRUE;
1167  if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
1168  TIFFCleanup(tif);
1169  return (PIX *)ERROR_PTR("pix not read", procName, NULL);
1170  }
1171  break;
1172  }
1173  if (TIFFReadDirectory(tif) == 0)
1174  break;
1175  }
1176 
1177  if (pagefound == FALSE) {
1178  L_WARNING("tiff page %d not found", procName, n);
1179  TIFFCleanup(tif);
1180  return NULL;
1181  }
1182 
1183  TIFFCleanup(tif);
1184  return pix;
1185 }
1186 
1187 static l_int32
1188 getTiffCompressedFormat(l_uint16 tiffcomp)
1189 {
1190 l_int32 comptype;
1191 
1192  switch (tiffcomp)
1193  {
1194  case COMPRESSION_CCITTFAX4:
1195  comptype = IFF_TIFF_G4;
1196  break;
1197  case COMPRESSION_CCITTFAX3:
1198  comptype = IFF_TIFF_G3;
1199  break;
1200  case COMPRESSION_CCITTRLE:
1201  comptype = IFF_TIFF_RLE;
1202  break;
1203  case COMPRESSION_PACKBITS:
1204  comptype = IFF_TIFF_PACKBITS;
1205  break;
1206  case COMPRESSION_LZW:
1207  comptype = IFF_TIFF_LZW;
1208  break;
1209  case COMPRESSION_ADOBE_DEFLATE:
1210  comptype = IFF_TIFF_ZIP;
1211  break;
1212  default:
1213  comptype = IFF_TIFF;
1214  break;
1215  }
1216  return comptype;
1217 }
1218 
1219 void compare(l_uint32 *cpu, l_uint32 *gpu,int size)
1220 {
1221  for(int i=0;i<size;i++)
1222  {
1223  if(cpu[i]!=gpu[i])
1224  {
1225  printf("\ndoesnot match\n");
1226  return;
1227  }
1228  }
1229  printf("\nit matches\n");
1230 
1231 }
1232 
1233 //OpenCL implementation of pixReadFromTiffStream.
1234 //Similar to the CPU implentation of pixReadFromTiffStream
1235 PIX *
1236 OpenclDevice::pixReadFromTiffStreamCl(TIFF *tif)
1237 {
1238 l_uint8 *linebuf, *data;
1239 l_uint16 spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
1240 l_uint16 *redmap, *greenmap, *bluemap;
1241 l_int32 d, wpl, bpl, comptype, i, ncolors;
1242 l_int32 xres, yres;
1243 l_uint32 w, h;
1244 l_uint32 *line, *tiffdata;
1245 PIX *pix;
1246 PIXCMAP *cmap;
1247 
1248  PROCNAME("pixReadFromTiffStream");
1249 
1250  if (!tif)
1251  return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
1252 
1253 
1254  TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
1255  TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
1256  bpp = bps * spp;
1257  if (bpp > 32)
1258  return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
1259  if (spp == 1)
1260  d = bps;
1261  else if (spp == 3 || spp == 4)
1262  d = 32;
1263  else
1264  return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
1265 
1266  TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
1267  TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
1268  tiffbpl = TIFFScanlineSize(tif);
1269 
1270  if ((pix = pixCreate(w, h, d)) == NULL)
1271  return (PIX *)ERROR_PTR("pix not made", procName, NULL);
1272  data = (l_uint8 *)pixGetData(pix);
1273  wpl = pixGetWpl(pix);
1274  bpl = 4 * wpl;
1275 
1276 
1277  if (spp == 1) {
1278  if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
1279  return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
1280 
1281  for (i = 0 ; i < h ; i++) {
1282  if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
1283  FREE(linebuf);
1284  pixDestroy(&pix);
1285  return (PIX *)ERROR_PTR("line read fail", procName, NULL);
1286  }
1287  memcpy((char *)data, (char *)linebuf, tiffbpl);
1288  data += bpl;
1289  }
1290  if (bps <= 8)
1291  pixEndianByteSwap(pix);
1292  else
1293  pixEndianTwoByteSwap(pix);
1294  FREE(linebuf);
1295  }
1296  else {
1297  if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
1298  pixDestroy(&pix);
1299  return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
1300  }
1301  if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
1302  ORIENTATION_TOPLEFT, 0)) {
1303  FREE(tiffdata);
1304  pixDestroy(&pix);
1305  return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
1306  }
1307  line = pixGetData(pix);
1308 
1309  //Invoke the OpenCL kernel for pixReadFromTiff
1310  l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
1311 
1312  pixSetData(pix, output_gpu);
1313  // pix already has data allocated, it now points to output_gpu?
1314  FREE(tiffdata);
1315  FREE(line);
1316  //FREE(output_gpu);
1317  }
1318 
1319  if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
1320  pixSetXRes(pix, xres);
1321  pixSetYRes(pix, yres);
1322  }
1323 
1324 
1325  TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
1326  comptype = getTiffCompressedFormat(tiffcomp);
1327  pixSetInputFormat(pix, comptype);
1328 
1329  if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
1330 
1331  if ((cmap = pixcmapCreate(bps)) == NULL) {
1332  pixDestroy(&pix);
1333  return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
1334  }
1335  ncolors = 1 << bps;
1336  for (i = 0; i < ncolors; i++)
1337  pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
1338  bluemap[i] >> 8);
1339  pixSetColormap(pix, cmap);
1340  }
1341  else {
1342  if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
1343 
1344  if (tiffcomp == COMPRESSION_CCITTFAX3 ||
1345  tiffcomp == COMPRESSION_CCITTFAX4 ||
1346  tiffcomp == COMPRESSION_CCITTRLE ||
1347  tiffcomp == COMPRESSION_CCITTRLEW) {
1348  photometry = PHOTOMETRIC_MINISWHITE;
1349  }
1350  else
1351  photometry = PHOTOMETRIC_MINISBLACK;
1352  }
1353  if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
1354  (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
1355  pixInvert(pix, pix);
1356  }
1357 
1358  if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
1359  if (orientation >= 1 && orientation <= 8) {
1360  struct tiff_transform *transform =
1361  &tiff_orientation_transforms[orientation - 1];
1362  if (transform->vflip) pixFlipTB(pix, pix);
1363  if (transform->hflip) pixFlipLR(pix, pix);
1364  if (transform->rotate) {
1365  PIX *oldpix = pix;
1366  pix = pixRotate90(oldpix, transform->rotate);
1367  pixDestroy(&oldpix);
1368  }
1369  }
1370  }
1371 
1372  return pix;
1373 }
1374 
1375 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1376 cl_int
1377 pixDilateCL_55(l_int32 wpl, l_int32 h)
1378 {
1379  size_t globalThreads[2];
1380  cl_mem pixtemp;
1381  cl_int status;
1382  int gsize;
1383  size_t localThreads[2];
1384 
1385  //Horizontal pass
1386  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1387  globalThreads[0] = gsize;
1388  globalThreads[1] = GROUPSIZE_HMORY;
1389  localThreads[0] = GROUPSIZE_HMORX;
1390  localThreads[1] = GROUPSIZE_HMORY;
1391 
1392  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
1393 
1394  status = clSetKernelArg(rEnv.mpkKernel,
1395  0,
1396  sizeof(cl_mem),
1397  &pixsCLBuffer);
1398  status = clSetKernelArg(rEnv.mpkKernel,
1399  1,
1400  sizeof(cl_mem),
1401  &pixdCLBuffer);
1402  status = clSetKernelArg(rEnv.mpkKernel,
1403  2,
1404  sizeof(wpl),
1405  (const void *)&wpl);
1406  status = clSetKernelArg(rEnv.mpkKernel,
1407  3,
1408  sizeof(h),
1409  (const void *)&h);
1410 
1411  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1412  rEnv.mpkKernel,
1413  2,
1414  NULL,
1415  globalThreads,
1416  localThreads,
1417  0,
1418  NULL,
1419  NULL);
1420 
1421  //Swap source and dest buffers
1422  pixtemp = pixsCLBuffer;
1423  pixsCLBuffer = pixdCLBuffer;
1424  pixdCLBuffer = pixtemp;
1425 
1426  //Vertical
1427  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1428  globalThreads[0] = gsize;
1429  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1430  globalThreads[1] = gsize;
1431  localThreads[0] = GROUPSIZE_X;
1432  localThreads[1] = GROUPSIZE_Y;
1433 
1434  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
1435 
1436  status = clSetKernelArg(rEnv.mpkKernel,
1437  0,
1438  sizeof(cl_mem),
1439  &pixsCLBuffer);
1440  status = clSetKernelArg(rEnv.mpkKernel,
1441  1,
1442  sizeof(cl_mem),
1443  &pixdCLBuffer);
1444  status = clSetKernelArg(rEnv.mpkKernel,
1445  2,
1446  sizeof(wpl),
1447  (const void *)&wpl);
1448  status = clSetKernelArg(rEnv.mpkKernel,
1449  3,
1450  sizeof(h),
1451  (const void *)&h);
1452  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1453  rEnv.mpkKernel,
1454  2,
1455  NULL,
1456  globalThreads,
1457  localThreads,
1458  0,
1459  NULL,
1460  NULL);
1461 
1462  return status;
1463 }
1464 
1465 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1466 cl_int
1467 pixErodeCL_55(l_int32 wpl, l_int32 h)
1468 {
1469  size_t globalThreads[2];
1470  cl_mem pixtemp;
1471  cl_int status;
1472  int gsize;
1473  l_uint32 fwmask, lwmask;
1474  size_t localThreads[2];
1475 
1476  lwmask = lmask32[32 - 2];
1477  fwmask = rmask32[32 - 2];
1478 
1479  //Horizontal pass
1480  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1481  globalThreads[0] = gsize;
1482  globalThreads[1] = GROUPSIZE_HMORY;
1483  localThreads[0] = GROUPSIZE_HMORX;
1484  localThreads[1] = GROUPSIZE_HMORY;
1485 
1486  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
1487 
1488  status = clSetKernelArg(rEnv.mpkKernel,
1489  0,
1490  sizeof(cl_mem),
1491  &pixsCLBuffer);
1492  status = clSetKernelArg(rEnv.mpkKernel,
1493  1,
1494  sizeof(cl_mem),
1495  &pixdCLBuffer);
1496  status = clSetKernelArg(rEnv.mpkKernel,
1497  2,
1498  sizeof(wpl),
1499  (const void *)&wpl);
1500  status = clSetKernelArg(rEnv.mpkKernel,
1501  3,
1502  sizeof(h),
1503  (const void *)&h);
1504 
1505  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1506  rEnv.mpkKernel,
1507  2,
1508  NULL,
1509  globalThreads,
1510  localThreads,
1511  0,
1512  NULL,
1513  NULL);
1514 
1515  //Swap source and dest buffers
1516  pixtemp = pixsCLBuffer;
1517  pixsCLBuffer = pixdCLBuffer;
1518  pixdCLBuffer = pixtemp;
1519 
1520  //Vertical
1521  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1522  globalThreads[0] = gsize;
1523  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1524  globalThreads[1] = gsize;
1525  localThreads[0] = GROUPSIZE_X;
1526  localThreads[1] = GROUPSIZE_Y;
1527 
1528  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
1529 
1530  status = clSetKernelArg(rEnv.mpkKernel,
1531  0,
1532  sizeof(cl_mem),
1533  &pixsCLBuffer);
1534  status = clSetKernelArg(rEnv.mpkKernel,
1535  1,
1536  sizeof(cl_mem),
1537  &pixdCLBuffer);
1538  status = clSetKernelArg(rEnv.mpkKernel,
1539  2,
1540  sizeof(wpl),
1541  (const void *)&wpl);
1542  status = clSetKernelArg(rEnv.mpkKernel,
1543  3,
1544  sizeof(h),
1545  (const void *)&h);
1546  status = clSetKernelArg(rEnv.mpkKernel,
1547  4,
1548  sizeof(fwmask),
1549  (const void *)&fwmask);
1550  status = clSetKernelArg(rEnv.mpkKernel,
1551  5,
1552  sizeof(lwmask),
1553  (const void *)&lwmask);
1554  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1555  rEnv.mpkKernel,
1556  2,
1557  NULL,
1558  globalThreads,
1559  localThreads,
1560  0,
1561  NULL,
1562  NULL);
1563 
1564  return status;
1565 }
1566 
1567 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1568 cl_int
1569 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1570 {
1571  l_int32 xp, yp, xn, yn;
1572  SEL* sel;
1573  size_t globalThreads[2];
1574  cl_mem pixtemp;
1575  cl_int status;
1576  int gsize;
1577  size_t localThreads[2];
1578  char isEven;
1579 
1580  OpenclDevice::SetKernelEnv( &rEnv );
1581 
1582  if (hsize == 5 && vsize == 5)
1583  {
1584  //Specific case for 5x5
1585  status = pixDilateCL_55(wpl, h);
1586  return status;
1587  }
1588 
1589  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1590 
1591  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1592  selDestroy(&sel);
1593  //global and local work dimensions for Horizontal pass
1594  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1595  globalThreads[0] = gsize;
1596  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1597  globalThreads[1] = gsize;
1598  localThreads[0] = GROUPSIZE_X;
1599  localThreads[1] = GROUPSIZE_Y;
1600 
1601  if (xp > 31 || xn > 31)
1602  {
1603  //Generic case.
1604  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
1605 
1606  status = clSetKernelArg(rEnv.mpkKernel,
1607  0,
1608  sizeof(cl_mem),
1609  &pixsCLBuffer);
1610  status = clSetKernelArg(rEnv.mpkKernel,
1611  1,
1612  sizeof(cl_mem),
1613  &pixdCLBuffer);
1614  status = clSetKernelArg(rEnv.mpkKernel,
1615  2,
1616  sizeof(xp),
1617  (const void *)&xp);
1618  status = clSetKernelArg(rEnv.mpkKernel,
1619  3,
1620  sizeof(xn),
1621  (const void *)&xn);
1622  status = clSetKernelArg(rEnv.mpkKernel,
1623  4,
1624  sizeof(wpl),
1625  (const void *)&wpl);
1626  status = clSetKernelArg(rEnv.mpkKernel,
1627  5,
1628  sizeof(h),
1629  (const void *)&h);
1630  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1631  rEnv.mpkKernel,
1632  2,
1633  NULL,
1634  globalThreads,
1635  localThreads,
1636  0,
1637  NULL,
1638  NULL);
1639 
1640  if (yp > 0 || yn > 0)
1641  {
1642  pixtemp = pixsCLBuffer;
1643  pixsCLBuffer = pixdCLBuffer;
1644  pixdCLBuffer = pixtemp;
1645  }
1646  }
1647  else if (xp > 0 || xn > 0 )
1648  {
1649  //Specific Horizontal pass kernel for half width < 32
1650  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
1651  isEven = (xp != xn);
1652 
1653  status = clSetKernelArg(rEnv.mpkKernel,
1654  0,
1655  sizeof(cl_mem),
1656  &pixsCLBuffer);
1657  status = clSetKernelArg(rEnv.mpkKernel,
1658  1,
1659  sizeof(cl_mem),
1660  &pixdCLBuffer);
1661  status = clSetKernelArg(rEnv.mpkKernel,
1662  2,
1663  sizeof(xp),
1664  (const void *)&xp);
1665  status = clSetKernelArg(rEnv.mpkKernel,
1666  3,
1667  sizeof(wpl),
1668  (const void *)&wpl);
1669  status = clSetKernelArg(rEnv.mpkKernel,
1670  4,
1671  sizeof(h),
1672  (const void *)&h);
1673  status = clSetKernelArg(rEnv.mpkKernel,
1674  5,
1675  sizeof(isEven),
1676  (const void *)&isEven);
1677  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1678  rEnv.mpkKernel,
1679  2,
1680  NULL,
1681  globalThreads,
1682  localThreads,
1683  0,
1684  NULL,
1685  NULL);
1686 
1687  if (yp > 0 || yn > 0)
1688  {
1689  pixtemp = pixsCLBuffer;
1690  pixsCLBuffer = pixdCLBuffer;
1691  pixdCLBuffer = pixtemp;
1692  }
1693  }
1694 
1695  if (yp > 0 || yn > 0)
1696  {
1697  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
1698 
1699  status = clSetKernelArg(rEnv.mpkKernel,
1700  0,
1701  sizeof(cl_mem),
1702  &pixsCLBuffer);
1703  status = clSetKernelArg(rEnv.mpkKernel,
1704  1,
1705  sizeof(cl_mem),
1706  &pixdCLBuffer);
1707  status = clSetKernelArg(rEnv.mpkKernel,
1708  2,
1709  sizeof(yp),
1710  (const void *)&yp);
1711  status = clSetKernelArg(rEnv.mpkKernel,
1712  3,
1713  sizeof(wpl),
1714  (const void *)&wpl);
1715  status = clSetKernelArg(rEnv.mpkKernel,
1716  4,
1717  sizeof(h),
1718  (const void *)&h);
1719  status = clSetKernelArg(rEnv.mpkKernel,
1720  5,
1721  sizeof(yn),
1722  (const void *)&yn);
1723  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1724  rEnv.mpkKernel,
1725  2,
1726  NULL,
1727  globalThreads,
1728  localThreads,
1729  0,
1730  NULL,
1731  NULL);
1732  }
1733 
1734 
1735  return status;
1736 }
1737 
1738 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1739 cl_int
1740 pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h)
1741 {
1742 
1743  l_int32 xp, yp, xn, yn;
1744  SEL* sel;
1745  size_t globalThreads[2];
1746  size_t localThreads[2];
1747  cl_mem pixtemp;
1748  cl_int status;
1749  int gsize;
1750  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1751  l_uint32 rwmask, lwmask;
1752  char isEven;
1753 
1754  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1755 
1756  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1757  selDestroy(&sel);
1758  OpenclDevice::SetKernelEnv( &rEnv );
1759 
1760  if (hsize == 5 && vsize == 5 && isAsymmetric)
1761  {
1762  //Specific kernel for 5x5
1763  status = pixErodeCL_55(wpl, h);
1764  return status;
1765  }
1766 
1767  rwmask = rmask32[32 - (xp & 31)];
1768  lwmask = lmask32[32 - (xn & 31)];
1769 
1770  //global and local work dimensions for Horizontal pass
1771  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1772  globalThreads[0] = gsize;
1773  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1774  globalThreads[1] = gsize;
1775  localThreads[0] = GROUPSIZE_X;
1776  localThreads[1] = GROUPSIZE_Y;
1777 
1778  //Horizontal Pass
1779  if (xp > 31 || xn > 31 )
1780  {
1781  //Generic case.
1782  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
1783 
1784  status = clSetKernelArg(rEnv.mpkKernel,
1785  0,
1786  sizeof(cl_mem),
1787  &pixsCLBuffer);
1788  status = clSetKernelArg(rEnv.mpkKernel,
1789  1,
1790  sizeof(cl_mem),
1791  &pixdCLBuffer);
1792  status = clSetKernelArg(rEnv.mpkKernel,
1793  2,
1794  sizeof(xp),
1795  (const void *)&xp);
1796  status = clSetKernelArg(rEnv.mpkKernel,
1797  3,
1798  sizeof(xn),
1799  (const void *)&xn);
1800  status = clSetKernelArg(rEnv.mpkKernel,
1801  4,
1802  sizeof(wpl),
1803  (const void *)&wpl);
1804  status = clSetKernelArg(rEnv.mpkKernel,
1805  5,
1806  sizeof(h),
1807  (const void *)&h);
1808  status = clSetKernelArg(rEnv.mpkKernel,
1809  6,
1810  sizeof(isAsymmetric),
1811  (const void *)&isAsymmetric);
1812  status = clSetKernelArg(rEnv.mpkKernel,
1813  7,
1814  sizeof(rwmask),
1815  (const void *)&rwmask);
1816  status = clSetKernelArg(rEnv.mpkKernel,
1817  8,
1818  sizeof(lwmask),
1819  (const void *)&lwmask);
1820  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1821  rEnv.mpkKernel,
1822  2,
1823  NULL,
1824  globalThreads,
1825  localThreads,
1826  0,
1827  NULL,
1828  NULL);
1829 
1830  if (yp > 0 || yn > 0)
1831  {
1832  pixtemp = pixsCLBuffer;
1833  pixsCLBuffer = pixdCLBuffer;
1834  pixdCLBuffer = pixtemp;
1835  }
1836  }
1837  else if (xp > 0 || xn > 0)
1838  {
1839  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
1840  isEven = (xp != xn);
1841 
1842  status = clSetKernelArg(rEnv.mpkKernel,
1843  0,
1844  sizeof(cl_mem),
1845  &pixsCLBuffer);
1846  status = clSetKernelArg(rEnv.mpkKernel,
1847  1,
1848  sizeof(cl_mem),
1849  &pixdCLBuffer);
1850  status = clSetKernelArg(rEnv.mpkKernel,
1851  2,
1852  sizeof(xp),
1853  (const void *)&xp);
1854  status = clSetKernelArg(rEnv.mpkKernel,
1855  3,
1856  sizeof(wpl),
1857  (const void *)&wpl);
1858  status = clSetKernelArg(rEnv.mpkKernel,
1859  4,
1860  sizeof(h),
1861  (const void *)&h);
1862  status = clSetKernelArg(rEnv.mpkKernel,
1863  5,
1864  sizeof(isAsymmetric),
1865  (const void *)&isAsymmetric);
1866  status = clSetKernelArg(rEnv.mpkKernel,
1867  6,
1868  sizeof(rwmask),
1869  (const void *)&rwmask);
1870  status = clSetKernelArg(rEnv.mpkKernel,
1871  7,
1872  sizeof(lwmask),
1873  (const void *)&lwmask);
1874  status = clSetKernelArg(rEnv.mpkKernel,
1875  8,
1876  sizeof(isEven),
1877  (const void *)&isEven);
1878  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1879  rEnv.mpkKernel,
1880  2,
1881  NULL,
1882  globalThreads,
1883  localThreads,
1884  0,
1885  NULL,
1886  NULL);
1887 
1888  if (yp > 0 || yn > 0)
1889  {
1890  pixtemp = pixsCLBuffer;
1891  pixsCLBuffer = pixdCLBuffer;
1892  pixdCLBuffer = pixtemp;
1893  }
1894  }
1895 
1896  //Vertical Pass
1897  if (yp > 0 || yn > 0)
1898  {
1899  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
1900 
1901  status = clSetKernelArg(rEnv.mpkKernel,
1902  0,
1903  sizeof(cl_mem),
1904  &pixsCLBuffer);
1905  status = clSetKernelArg(rEnv.mpkKernel,
1906  1,
1907  sizeof(cl_mem),
1908  &pixdCLBuffer);
1909  status = clSetKernelArg(rEnv.mpkKernel,
1910  2,
1911  sizeof(yp),
1912  (const void *)&yp);
1913  status = clSetKernelArg(rEnv.mpkKernel,
1914  3,
1915  sizeof(wpl),
1916  (const void *)&wpl);
1917  status = clSetKernelArg(rEnv.mpkKernel,
1918  4,
1919  sizeof(h),
1920  (const void *)&h);
1921  status = clSetKernelArg(rEnv.mpkKernel,
1922  5,
1923  sizeof(isAsymmetric),
1924  (const void *)&isAsymmetric);
1925  status = clSetKernelArg(rEnv.mpkKernel,
1926  6,
1927  sizeof(yn),
1928  (const void *)&yn);
1929  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
1930  rEnv.mpkKernel,
1931  2,
1932  NULL,
1933  globalThreads,
1934  localThreads,
1935  0,
1936  NULL,
1937  NULL);
1938  }
1939 
1940  return status;
1941 }
1942 
1943 // OpenCL implementation of Morphology Dilate
1944 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1945 PIX*
1946 OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false)
1947 {
1948  l_uint32 wpl, h;
1949 
1950  wpl = pixGetWpl(pixs);
1951  h = pixGetHeight(pixs);
1952 
1953  clStatus = pixDilateCL(hsize, vsize, wpl, h);
1954 
1955  if (reqDataCopy)
1956  {
1957  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
1958  }
1959 
1960  return pixd;
1961 }
1962 
1963 // OpenCL implementation of Morphology Erode
1964 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1965 PIX*
1966 OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false)
1967 {
1968  l_uint32 wpl, h;
1969 
1970  wpl = pixGetWpl(pixs);
1971  h = pixGetHeight(pixs);
1972 
1973  clStatus = pixErodeCL(hsize, vsize, wpl, h);
1974 
1975  if (reqDataCopy)
1976  {
1977  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
1978  }
1979 
1980  return pixd;
1981 }
1982 
1983 //Morphology Open operation. Invokes the relevant OpenCL kernels
1984 cl_int
1985 pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1986 {
1987  cl_int status;
1988  cl_mem pixtemp;
1989 
1990  //Erode followed by Dilate
1991  status = pixErodeCL(hsize, vsize, wpl, h);
1992 
1993  pixtemp = pixsCLBuffer;
1994  pixsCLBuffer = pixdCLBuffer;
1995  pixdCLBuffer = pixtemp;
1996 
1997  status = pixDilateCL(hsize, vsize, wpl, h);
1998 
1999  return status;
2000 }
2001 
2002 //Morphology Close operation. Invokes the relevant OpenCL kernels
2003 cl_int
2004 pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
2005 {
2006  cl_int status;
2007  cl_mem pixtemp;
2008 
2009  //Dilate followed by Erode
2010  status = pixDilateCL(hsize, vsize, wpl, h);
2011 
2012  pixtemp = pixsCLBuffer;
2013  pixsCLBuffer = pixdCLBuffer;
2014  pixdCLBuffer = pixtemp;
2015 
2016  status = pixErodeCL(hsize, vsize, wpl, h);
2017 
2018  return status;
2019 }
2020 
2021 // OpenCL implementation of Morphology Close
2022 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2023 PIX*
2024 OpenclDevice::pixCloseBrickCL(PIX *pixd,
2025  PIX *pixs,
2026  l_int32 hsize,
2027  l_int32 vsize,
2028  bool reqDataCopy = false)
2029 {
2030  l_uint32 wpl, h;
2031 
2032  wpl = pixGetWpl(pixs);
2033  h = pixGetHeight(pixs);
2034 
2035  clStatus = pixCloseCL(hsize, vsize, wpl, h);
2036 
2037  if (reqDataCopy)
2038  {
2039  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2040  }
2041 
2042  return pixd;
2043 }
2044 
2045 // OpenCL implementation of Morphology Open
2046 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2047 PIX*
2048 OpenclDevice::pixOpenBrickCL(PIX *pixd,
2049  PIX *pixs,
2050  l_int32 hsize,
2051  l_int32 vsize,
2052  bool reqDataCopy = false)
2053 {
2054  l_uint32 wpl, h;
2055 
2056  wpl = pixGetWpl(pixs);
2057  h = pixGetHeight(pixs);
2058 
2059  clStatus = pixOpenCL(hsize, vsize, wpl, h);
2060 
2061  if (reqDataCopy)
2062  {
2063  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2064  }
2065 
2066  return pixd;
2067 }
2068 
2069 //pix OR operation: outbuffer = buffer1 | buffer2
2070 cl_int
2071 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
2072 {
2073  cl_int status;
2074  size_t globalThreads[2];
2075  int gsize;
2076  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2077 
2078  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2079  globalThreads[0] = gsize;
2080  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2081  globalThreads[1] = gsize;
2082 
2083  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
2084 
2085  status = clSetKernelArg(rEnv.mpkKernel,
2086  0,
2087  sizeof(cl_mem),
2088  &buffer1);
2089  status = clSetKernelArg(rEnv.mpkKernel,
2090  1,
2091  sizeof(cl_mem),
2092  &buffer2);
2093  status = clSetKernelArg(rEnv.mpkKernel,
2094  2,
2095  sizeof(cl_mem),
2096  &outbuffer);
2097  status = clSetKernelArg(rEnv.mpkKernel,
2098  3,
2099  sizeof(wpl),
2100  (const void *)&wpl);
2101  status = clSetKernelArg(rEnv.mpkKernel,
2102  4,
2103  sizeof(h),
2104  (const void *)&h);
2105  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2106  rEnv.mpkKernel,
2107  2,
2108  NULL,
2109  globalThreads,
2110  localThreads,
2111  0,
2112  NULL,
2113  NULL);
2114 
2115  return status;
2116 }
2117 
2118 //pix AND operation: outbuffer = buffer1 & buffer2
2119 cl_int
2120 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
2121 {
2122  cl_int status;
2123  size_t globalThreads[2];
2124  int gsize;
2125  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2126 
2127  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2128  globalThreads[0] = gsize;
2129  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2130  globalThreads[1] = gsize;
2131 
2132  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
2133 
2134  // Enqueue a kernel run call.
2135  status = clSetKernelArg(rEnv.mpkKernel,
2136  0,
2137  sizeof(cl_mem),
2138  &buffer1);
2139  status = clSetKernelArg(rEnv.mpkKernel,
2140  1,
2141  sizeof(cl_mem),
2142  &buffer2);
2143  status = clSetKernelArg(rEnv.mpkKernel,
2144  2,
2145  sizeof(cl_mem),
2146  &outbuffer);
2147  status = clSetKernelArg(rEnv.mpkKernel,
2148  3,
2149  sizeof(wpl),
2150  (const void *)&wpl);
2151  status = clSetKernelArg(rEnv.mpkKernel,
2152  4,
2153  sizeof(h),
2154  (const void *)&h);
2155  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2156  rEnv.mpkKernel,
2157  2,
2158  NULL,
2159  globalThreads,
2160  localThreads,
2161  0,
2162  NULL,
2163  NULL);
2164 
2165  return status;
2166 }
2167 
2168 //output = buffer1 & ~(buffer2)
2169 cl_int
2170 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
2171 {
2172  cl_int status;
2173  size_t globalThreads[2];
2174  int gsize;
2175  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
2176 
2177  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
2178  globalThreads[0] = gsize;
2179  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
2180  globalThreads[1] = gsize;
2181 
2182  if (outBuffer != NULL)
2183  {
2184  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
2185  }
2186  else
2187  {
2188  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
2189  }
2190 
2191  // Enqueue a kernel run call.
2192  status = clSetKernelArg(rEnv.mpkKernel,
2193  0,
2194  sizeof(cl_mem),
2195  &buffer1);
2196  status = clSetKernelArg(rEnv.mpkKernel,
2197  1,
2198  sizeof(cl_mem),
2199  &buffer2);
2200  status = clSetKernelArg(rEnv.mpkKernel,
2201  2,
2202  sizeof(wpl),
2203  (const void *)&wpl);
2204  status = clSetKernelArg(rEnv.mpkKernel,
2205  3,
2206  sizeof(h),
2207  (const void *)&h);
2208  if (outBuffer != NULL)
2209  {
2210  status = clSetKernelArg(rEnv.mpkKernel,
2211  4,
2212  sizeof(cl_mem),
2213  (const void *)&outBuffer);
2214  }
2215  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
2216  rEnv.mpkKernel,
2217  2,
2218  NULL,
2219  globalThreads,
2220  localThreads,
2221  0,
2222  NULL,
2223  NULL);
2224 
2225  return status;
2226 }
2227 
2228 // OpenCL implementation of Subtract pix
2229 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2230 PIX*
2231 OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false)
2232 {
2233  l_uint32 wpl, h;
2234 
2235  PROCNAME("pixSubtractCL");
2236 
2237  if (!pixs1)
2238  return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
2239  if (!pixs2)
2240  return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
2241  if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
2242  return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
2243 
2244 #if EQUAL_SIZE_WARNING
2245  if (!pixSizesEqual(pixs1, pixs2))
2246  L_WARNING("pixs1 and pixs2 not equal sizes", procName);
2247 #endif /* EQUAL_SIZE_WARNING */
2248 
2249  wpl = pixGetWpl(pixs1);
2250  h = pixGetHeight(pixs1);
2251 
2252  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2253 
2254  if (reqDataCopy)
2255  {
2256  //Read back output data from OCL buffer to cpu
2257  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
2258  }
2259 
2260  return pixd;
2261 }
2262 
2263 // OpenCL implementation of Hollow pix
2264 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2265 PIX*
2266 OpenclDevice::pixHollowCL(PIX *pixd,
2267  PIX *pixs,
2268  l_int32 close_hsize,
2269  l_int32 close_vsize,
2270  l_int32 open_hsize,
2271  l_int32 open_vsize,
2272  bool reqDataCopy = false)
2273 {
2274  l_uint32 wpl, h;
2275  cl_mem pixtemp;
2276 
2277  wpl = pixGetWpl(pixs);
2278  h = pixGetHeight(pixs);
2279 
2280  //First step : Close Morph operation: Dilate followed by Erode
2281  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2282 
2283  //Store the output of close operation in an intermediate buffer
2284  //this will be later used for pixsubtract
2285  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2286 
2287  //Second step: Open Operation - Erode followed by Dilate
2288  pixtemp = pixsCLBuffer;
2289  pixsCLBuffer = pixdCLBuffer;
2290  pixdCLBuffer = pixtemp;
2291 
2292  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2293 
2294  //Third step: Subtract : (Close - Open)
2295  pixtemp = pixsCLBuffer;
2296  pixsCLBuffer = pixdCLBuffer;
2297  pixdCLBuffer = pixdCLIntermediate;
2298  pixdCLIntermediate = pixtemp;
2299 
2300  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2301 
2302  if (reqDataCopy)
2303  {
2304  //Read back output data from OCL buffer to cpu
2305  pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
2306  }
2307  return pixd;
2308 }
2309 
2310 // OpenCL implementation of Get Lines from pix function
2311 //Note: Assumes the source and dest opencl buffer are initialized. No check done
2312 void
2313 OpenclDevice::pixGetLinesCL(PIX *pixd,
2314  PIX *pixs,
2315  PIX** pix_vline,
2316  PIX** pix_hline,
2317  PIX** pixClosed,
2318  bool getpixClosed,
2319  l_int32 close_hsize, l_int32 close_vsize,
2320  l_int32 open_hsize, l_int32 open_vsize,
2321  l_int32 line_hsize, l_int32 line_vsize)
2322 {
2323  l_uint32 wpl, h;
2324  cl_mem pixtemp;
2325 
2326  wpl = pixGetWpl(pixs);
2327  h = pixGetHeight(pixs);
2328 
2329  //First step : Close Morph operation: Dilate followed by Erode
2330  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
2331 
2332  //Copy the Close output to CPU buffer
2333  if (getpixClosed)
2334  {
2335  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
2336  }
2337 
2338  //Store the output of close operation in an intermediate buffer
2339  //this will be later used for pixsubtract
2340  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2341 
2342  //Second step: Open Operation - Erode followed by Dilate
2343  pixtemp = pixsCLBuffer;
2344  pixsCLBuffer = pixdCLBuffer;
2345  pixdCLBuffer = pixtemp;
2346 
2347  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
2348 
2349  //Third step: Subtract : (Close - Open)
2350  pixtemp = pixsCLBuffer;
2351  pixsCLBuffer = pixdCLBuffer;
2352  pixdCLBuffer = pixdCLIntermediate;
2353  pixdCLIntermediate = pixtemp;
2354 
2355  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
2356 
2357  //Store the output of Hollow operation in an intermediate buffer
2358  //this will be later used
2359  clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
2360 
2361  pixtemp = pixsCLBuffer;
2362  pixsCLBuffer = pixdCLBuffer;
2363  pixdCLBuffer = pixtemp;
2364 
2365  //Fourth step: Get vertical line
2366  //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
2367  clStatus = pixOpenCL(1, line_vsize, wpl, h);
2368 
2369  //Copy the vertical line output to CPU buffer
2370  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
2371 
2372  pixtemp = pixsCLBuffer;
2373  pixsCLBuffer = pixdCLIntermediate;
2374  pixdCLIntermediate = pixtemp;
2375 
2376  //Fifth step: Get horizontal line
2377  //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
2378  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
2379 
2380  //Copy the horizontal line output to CPU buffer
2381  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
2382 
2383  return;
2384 }
2385 
2386 
2387 /*************************************************************************
2388  * HistogramRect
2389  * Otsu Thresholding Operations
2390  * histogramAllChannels is laid out as all channel 0, then all channel 1...
2391  * only supports 1 or 4 channels (bytes_per_pixel)
2392  ************************************************************************/
2393 int OpenclDevice::HistogramRectOCL(
2394  const unsigned char* imageData,
2395  int bytes_per_pixel,
2396  int bytes_per_line,
2397  int left, // always 0
2398  int top, // always 0
2399  int width,
2400  int height,
2401  int kHistogramSize,
2402  int* histogramAllChannels)
2403 {
2404 PERF_COUNT_START("HistogramRectOCL")
2405  cl_int clStatus;
2406  int retVal= 0;
2407  KernelEnv histKern;
2408  SetKernelEnv( &histKern );
2409  KernelEnv histRedKern;
2410  SetKernelEnv( &histRedKern );
2411  /* map imagedata to device as read only */
2412  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
2413  // faster option would be to allocate initial image buffer
2414  // using a garlic bus memory type
2415  cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
2416  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2417 
2418  /* setup work group size parameters */
2419  int block_size = 256;
2420  cl_uint numCUs;
2421  clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
2422  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2423 
2424  int requestedOccupancy = 10;
2425  int numWorkGroups = numCUs * requestedOccupancy;
2426  int numThreads = block_size*numWorkGroups;
2427  size_t local_work_size[] = {static_cast<size_t>(block_size)};
2428  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
2429  size_t red_global_work_size[] = {static_cast<size_t>(block_size*kHistogramSize*bytes_per_pixel)};
2430 
2431  /* map histogramAllChannels as write only */
2432  int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
2433 
2434  cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
2435  CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
2436 
2437  /* intermediate histogram buffer */
2438  int histRed = 256;
2439  int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed;
2440 
2441  cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
2442  CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
2443 
2444  /* atomic sync buffer */
2445  int *zeroBuffer = new int[1];
2446  zeroBuffer[0] = 0;
2447  cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
2448  CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
2449  delete[] zeroBuffer;
2450  //Create kernel objects based on bytes_per_pixel
2451  if (bytes_per_pixel == 1)
2452  {
2453  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
2454  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
2455 
2456  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
2457  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
2458  } else {
2459  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
2460  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
2461 
2462  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
2463  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
2464  }
2465 
2466  void *ptr;
2467 
2468  //Initialize tmpHistogramBuffer buffer
2469  ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
2470  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
2471 
2472  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
2473  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
2474 
2475  /* set kernel 1 arguments */
2476  clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
2477  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2478  cl_uint numPixels = width*height;
2479  clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
2480  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
2481  clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
2482  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2483 
2484  /* set kernel 2 arguments */
2485  int n = numThreads/bytes_per_pixel;
2486  clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
2487  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2488  clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
2489  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
2490  clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
2491  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
2492 
2493  /* launch histogram */
2494 PERF_COUNT_SUB("before")
2495  clStatus = clEnqueueNDRangeKernel(
2496  histKern.mpkCmdQueue,
2497  histKern.mpkKernel,
2498  1, NULL, global_work_size, local_work_size,
2499  0, NULL, NULL );
2500  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
2501  clFinish( histKern.mpkCmdQueue );
2502  if(clStatus !=0)
2503  {
2504  retVal = -1;
2505  }
2506  /* launch histogram */
2507  clStatus = clEnqueueNDRangeKernel(
2508  histRedKern.mpkCmdQueue,
2509  histRedKern.mpkKernel,
2510  1, NULL, red_global_work_size, local_work_size,
2511  0, NULL, NULL );
2512  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
2513  clFinish( histRedKern.mpkCmdQueue );
2514  if(clStatus !=0)
2515  {
2516  retVal = -1;
2517  }
2518 PERF_COUNT_SUB("redKernel")
2519 
2520  /* map results back from gpu */
2521  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
2522  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
2523  if(clStatus !=0)
2524  {
2525  retVal = -1;
2526  }
2527  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
2528 
2529  clReleaseMemObject(histogramBuffer);
2530  clReleaseMemObject(imageBuffer);
2531 PERF_COUNT_SUB("after")
2533  return retVal;
2534 
2535 }
2536 
2537 /*************************************************************************
2538  * Threshold the rectangle, taking everything except the image buffer pointer
2539  * from the class, using thresholds/hi_values to the output IMAGE.
2540  * only supports 1 or 4 channels
2541  ************************************************************************/
2542 int OpenclDevice::ThresholdRectToPixOCL(
2543  const unsigned char* imageData,
2544  int bytes_per_pixel,
2545  int bytes_per_line,
2546  const int* thresholds,
2547  const int* hi_values,
2548  Pix** pix,
2549  int height,
2550  int width,
2551  int top,
2552  int left) {
2553 PERF_COUNT_START("ThresholdRectToPixOCL")
2554  int retVal =0;
2555  /* create pix result buffer */
2556  *pix = pixCreate(width, height, 1);
2557  uinT32* pixData = pixGetData(*pix);
2558  int wpl = pixGetWpl(*pix);
2559  int pixSize = wpl*height*sizeof(uinT32); // number of pixels
2560 
2561  cl_int clStatus;
2562  KernelEnv rEnv;
2563  SetKernelEnv( &rEnv );
2564 
2565  /* setup work group size parameters */
2566  int block_size = 256;
2567  cl_uint numCUs = 6;
2568  clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
2569  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2570 
2571  int requestedOccupancy = 10;
2572  int numWorkGroups = numCUs * requestedOccupancy;
2573  int numThreads = block_size*numWorkGroups;
2574  size_t local_work_size[] = {(size_t) block_size};
2575  size_t global_work_size[] = {(size_t) numThreads};
2576 
2577  /* map imagedata to device as read only */
2578  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
2579  // faster option would be to allocate initial image buffer
2580  // using a garlic bus memory type
2581  cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
2582  CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
2583 
2584  /* map pix as write only */
2585  pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
2586  CHECK_OPENCL( clStatus, "clCreateBuffer pix");
2587 
2588  /* map thresholds and hi_values */
2589  cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
2590  CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
2591  cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
2592  CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
2593 
2594  /* compile kernel */
2595  if (bytes_per_pixel == 4) {
2596  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
2597  CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
2598  } else {
2599  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
2600  CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
2601  }
2602 
2603  /* set kernel arguments */
2604  clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
2605  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
2606  cl_uint numPixels = width*height;
2607  clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
2608  CHECK_OPENCL( clStatus, "clSetKernelArg height" );
2609  clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
2610  CHECK_OPENCL( clStatus, "clSetKernelArg width" );
2611  clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
2612  CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
2613  clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
2614  CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
2615  clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
2616  CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
2617  clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
2618  CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
2619 
2620  /* launch kernel & wait */
2621 PERF_COUNT_SUB("before")
2622  clStatus = clEnqueueNDRangeKernel(
2623  rEnv.mpkCmdQueue,
2624  rEnv.mpkKernel,
2625  1, NULL, global_work_size, local_work_size,
2626  0, NULL, NULL );
2627  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
2628  clFinish( rEnv.mpkCmdQueue );
2629 PERF_COUNT_SUB("kernel")
2630  if(clStatus !=0)
2631  {
2632  printf("Setting return value to -1\n");
2633  retVal = -1;
2634  }
2635  /* map results back from gpu */
2636  void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
2637  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
2638  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
2639 
2640  clReleaseMemObject(imageBuffer);
2641  clReleaseMemObject(thresholdsBuffer);
2642  clReleaseMemObject(hiValuesBuffer);
2643 
2644 PERF_COUNT_SUB("after")
2646 return retVal;
2647 }
2648 
2649 
2650 
2651 /******************************************************************************
2652  * Data Types for Device Selection
2653  *****************************************************************************/
2654 
2655 typedef struct _TessScoreEvaluationInputData {
2656  int height;
2657  int width;
2658  int numChannels;
2659  unsigned char *imageData;
2660  Pix *pix;
2661 } TessScoreEvaluationInputData;
2662 
2663 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
2664  srand(1);
2665  // 8.5x11 inches @ 300dpi rounded to clean multiples
2666  int height = 3328; // %256
2667  int width = 2560; // %512
2668  int numChannels = 4;
2669  input->height = height;
2670  input->width = width;
2671  input->numChannels = numChannels;
2672  unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
2673  input->imageData = (unsigned char *) &imageData4[0];
2674 
2675  // zero out image
2676  unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2677  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2678  for (int p = 0; p < height*width; p++) {
2679  //unsigned char tmp[4] = imageData4[0];
2680  imageData4[p][0] = pixelWhite[0];
2681  imageData4[p][1] = pixelWhite[1];
2682  imageData4[p][2] = pixelWhite[2];
2683  imageData4[p][3] = pixelWhite[3];
2684  }
2685  // random lines to be eliminated
2686  int maxLineWidth = 64; // pixels wide
2687  int numLines = 10;
2688  // vertical lines
2689  for (int i = 0; i < numLines; i++) {
2690  int lineWidth = rand()%maxLineWidth;
2691  int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2692  //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2693  for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2694  for (int col = 0; col < height; col++) {
2695  //imageData4[row*width+col] = pixelBlack;
2696  imageData4[row*width+col][0] = pixelBlack[0];
2697  imageData4[row*width+col][1] = pixelBlack[1];
2698  imageData4[row*width+col][2] = pixelBlack[2];
2699  imageData4[row*width+col][3] = pixelBlack[3];
2700  }
2701  }
2702  }
2703  // horizontal lines
2704  for (int i = 0; i < numLines; i++) {
2705  int lineWidth = rand()%maxLineWidth;
2706  int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2707  //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2708  for (int row = 0; row < width; row++) {
2709  for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2710  //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2711  //imageData4[row*width+col] = pixelBlack;
2712  imageData4[row*width+col][0] = pixelBlack[0];
2713  imageData4[row*width+col][1] = pixelBlack[1];
2714  imageData4[row*width+col][2] = pixelBlack[2];
2715  imageData4[row*width+col][3] = pixelBlack[3];
2716  }
2717  }
2718  }
2719  // spots (noise, squares)
2720  float fractionBlack = 0.1; // how much of the image should be blackened
2721  int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2722  for (int i = 0; i < numSpots; i++) {
2723 
2724  int lineWidth = rand()%maxLineWidth;
2725  int col = lineWidth + rand()%(width-2*lineWidth);
2726  int row = lineWidth + rand()%(height-2*lineWidth);
2727  //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2728  for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2729  for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2730  //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2731  //imageData4[row*width+col] = pixelBlack;
2732  imageData4[r*width+c][0] = pixelBlack[0];
2733  imageData4[r*width+c][1] = pixelBlack[1];
2734  imageData4[r*width+c][2] = pixelBlack[2];
2735  imageData4[r*width+c][3] = pixelBlack[3];
2736  }
2737  }
2738  }
2739 
2740  input->pix = pixCreate(input->width, input->height, 1);
2741 }
2742 
2743 typedef struct _TessDeviceScore {
2744  float time; // small time means faster device
2745  bool clError; // were there any opencl errors
2746  bool valid; // was the correct response generated
2747 } TessDeviceScore;
2748 
2749 /******************************************************************************
2750  * Micro Benchmarks for Device Selection
2751  *****************************************************************************/
2752 
2753 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2754 
2755  double time = 0;
2756 #if ON_WINDOWS
2757  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2758  QueryPerformanceFrequency(&freq);
2759 #elif ON_APPLE
2760  mach_timebase_info_data_t info = { 0, 0 };
2761  mach_timebase_info(&info);
2762  long long start,stop;
2763 #else
2764  timespec time_funct_start, time_funct_end;
2765 #endif
2766  // input data
2767  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2768 
2769  // function call
2770  if (type == DS_DEVICE_OPENCL_DEVICE) {
2771 #if ON_WINDOWS
2772  QueryPerformanceCounter(&time_funct_start);
2773 #elif ON_APPLE
2774  start = mach_absolute_time();
2775 #else
2776  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2777 #endif
2778 
2779  OpenclDevice::gpuEnv = *env;
2780  int wpl = pixGetWpl(input.pix);
2781  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
2782 #if ON_WINDOWS
2783  QueryPerformanceCounter(&time_funct_end);
2784  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2785 #elif ON_APPLE
2786  stop = mach_absolute_time();
2787  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
2788 #else
2789  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2790  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2791 #endif
2792 
2793  } else {
2794 #if ON_WINDOWS
2795  QueryPerformanceCounter(&time_funct_start);
2796 #elif ON_APPLE
2797  start = mach_absolute_time();
2798 #else
2799  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2800 #endif
2801  Pix *pix = pixCreate(input.width, input.height, 32);
2802  l_uint32 *pixData = pixGetData(pix);
2803  int wpl = pixGetWpl(pix);
2804  //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
2805  //pixSetData(pix, output_gpu);
2806  int i, j;
2807  int idx = 0;
2808  for (i = 0; i < input.height ; i++) {
2809  for (j = 0; j < input.width; j++) {
2810 
2811  l_uint32 tiffword = tiffdata[i * input.width + j];
2812  l_int32 rval = ((tiffword) & 0xff);
2813  l_int32 gval = (((tiffword) >> 8) & 0xff);
2814  l_int32 bval = (((tiffword) >> 16) & 0xff);
2815  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2816  pixData[idx] = value;
2817  idx++;
2818  }
2819  }
2820 #if ON_WINDOWS
2821  QueryPerformanceCounter(&time_funct_end);
2822  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2823 #elif ON_APPLE
2824  stop = mach_absolute_time();
2825  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
2826 #else
2827  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2828  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2829 #endif
2830  pixDestroy(&pix);
2831  }
2832 
2833 
2834  // cleanup
2835 
2836  return time;
2837 }
2838 
2839 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2840 
2841  double time;
2842 #if ON_WINDOWS
2843  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2844  QueryPerformanceFrequency(&freq);
2845 #elif ON_APPLE
2846  mach_timebase_info_data_t info = { 0, 0 };
2847  mach_timebase_info(&info);
2848  long long start,stop;
2849 #else
2850  timespec time_funct_start, time_funct_end;
2851 #endif
2852 
2853  unsigned char pixelHi = (unsigned char)255;
2854 
2855  int left = 0;
2856  int top = 0;
2857  int kHistogramSize = 256;
2858  int bytes_per_line = input.width*input.numChannels;
2859  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
2860  int retVal= 0;
2861  // function call
2862  if (type == DS_DEVICE_OPENCL_DEVICE) {
2863 #if ON_WINDOWS
2864  QueryPerformanceCounter(&time_funct_start);
2865 #elif ON_APPLE
2866  start = mach_absolute_time();
2867 #else
2868  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2869 #endif
2870 
2871  OpenclDevice::gpuEnv = *env;
2872  int wpl = pixGetWpl(input.pix);
2873  retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
2874 
2875 #if ON_WINDOWS
2876  QueryPerformanceCounter(&time_funct_end);
2877  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2878 #elif ON_APPLE
2879  stop = mach_absolute_time();
2880  if(retVal ==0)
2881  {
2882  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
2883  }
2884  else
2885  {
2886  time= FLT_MAX;
2887  }
2888 #else
2889  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2890  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2891 #endif
2892  } else {
2893 
2894  int *histogram = new int[kHistogramSize];
2895 #if ON_WINDOWS
2896  QueryPerformanceCounter(&time_funct_start);
2897 #elif ON_APPLE
2898  start = mach_absolute_time();
2899 #else
2900  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2901 #endif
2902  for (int ch = 0; ch < input.numChannels; ++ch) {
2903  tesseract::HistogramRect(input.pix, input.numChannels,
2904  left, top, input.width, input.height, histogram);
2905  }
2906 #if ON_WINDOWS
2907  QueryPerformanceCounter(&time_funct_end);
2908  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2909 #elif ON_APPLE
2910  stop = mach_absolute_time();
2911  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
2912 #else
2913  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2914  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2915 #endif
2916  delete[] histogram;
2917  }
2918 
2919  // cleanup
2920  delete[] histogramAllChannels;
2921  return time;
2922 }
2923 
2924 //Reproducing the ThresholdRectToPix native version
2925 void ThresholdRectToPix_Native(const unsigned char* imagedata,
2926  int bytes_per_pixel,
2927  int bytes_per_line,
2928  const int* thresholds,
2929  const int* hi_values,
2930  Pix** pix) {
2931  int top = 0;
2932  int left = 0;
2933  int width = pixGetWidth(*pix);
2934  int height = pixGetHeight(*pix);
2935 
2936  *pix = pixCreate(width, height, 1);
2937  uinT32* pixdata = pixGetData(*pix);
2938  int wpl = pixGetWpl(*pix);
2939  const unsigned char* srcdata = imagedata + top * bytes_per_line +
2940  left * bytes_per_pixel;
2941  for (int y = 0; y < height; ++y) {
2942  const uinT8* linedata = srcdata;
2943  uinT32* pixline = pixdata + y * wpl;
2944  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2945  bool white_result = true;
2946  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2947  if (hi_values[ch] >= 0 &&
2948  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2949  white_result = false;
2950  break;
2951  }
2952  }
2953  if (white_result)
2954  CLEAR_DATA_BIT(pixline, x);
2955  else
2956  SET_DATA_BIT(pixline, x);
2957  }
2958  srcdata += bytes_per_line;
2959  }
2960 }
2961 
2962 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2963 
2964  double time;
2965  int retVal =0;
2966 #if ON_WINDOWS
2967  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2968  QueryPerformanceFrequency(&freq);
2969 #elif ON_APPLE
2970  mach_timebase_info_data_t info = { 0, 0 };
2971  mach_timebase_info(&info);
2972  long long start,stop;
2973 #else
2974  timespec time_funct_start, time_funct_end;
2975 #endif
2976 
2977  // input data
2978  unsigned char pixelHi = (unsigned char)255;
2979  int* thresholds = new int[4];
2980  thresholds[0] = pixelHi/2;
2981  thresholds[1] = pixelHi/2;
2982  thresholds[2] = pixelHi/2;
2983  thresholds[3] = pixelHi/2;
2984  int *hi_values = new int[4];
2985  thresholds[0] = pixelHi;
2986  thresholds[1] = pixelHi;
2987  thresholds[2] = pixelHi;
2988  thresholds[3] = pixelHi;
2989  //Pix* pix = pixCreate(width, height, 1);
2990  int top = 0;
2991  int left = 0;
2992  int bytes_per_line = input.width*input.numChannels;
2993 
2994  // function call
2995  if (type == DS_DEVICE_OPENCL_DEVICE) {
2996 #if ON_WINDOWS
2997  QueryPerformanceCounter(&time_funct_start);
2998 #elif ON_APPLE
2999  start = mach_absolute_time();
3000 #else
3001  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3002 #endif
3003 
3004  OpenclDevice::gpuEnv = *env;
3005  int wpl = pixGetWpl(input.pix);
3006  retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
3007 
3008 #if ON_WINDOWS
3009  QueryPerformanceCounter(&time_funct_end);
3010  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3011 #elif ON_APPLE
3012  stop = mach_absolute_time();
3013  if(retVal ==0)
3014  {
3015  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;;
3016  }
3017  else
3018  {
3019  time= FLT_MAX;
3020  }
3021 
3022 #else
3023  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3024  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3025 #endif
3026  } else {
3027 
3028 
3029  tesseract::ImageThresholder thresholder;
3030  thresholder.SetImage( input.pix );
3031 #if ON_WINDOWS
3032  QueryPerformanceCounter(&time_funct_start);
3033 #elif ON_APPLE
3034  start = mach_absolute_time();
3035 #else
3036  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3037 #endif
3038  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
3039  thresholds, hi_values, &input.pix );
3040 
3041 #if ON_WINDOWS
3042  QueryPerformanceCounter(&time_funct_end);
3043  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3044 #elif ON_APPLE
3045  stop = mach_absolute_time();
3046  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
3047 #else
3048  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3049  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3050 #endif
3051  }
3052 
3053  // cleanup
3054  delete[] thresholds;
3055  delete[] hi_values;
3056  return time;
3057 }
3058 
3059 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
3060 
3061  double time = 0;
3062 #if ON_WINDOWS
3063  LARGE_INTEGER freq, time_funct_start, time_funct_end;
3064  QueryPerformanceFrequency(&freq);
3065 #elif ON_APPLE
3066  mach_timebase_info_data_t info = { 0, 0 };
3067  mach_timebase_info(&info);
3068  long long start,stop;
3069 #else
3070  timespec time_funct_start, time_funct_end;
3071 #endif
3072 
3073  // input data
3074  int resolution = 300;
3075  int wpl = pixGetWpl(input.pix);
3076  int kThinLineFraction = 20; // tess constant
3077  int kMinLineLengthFraction = 4; // tess constant
3078  int max_line_width = resolution / kThinLineFraction;
3079  int min_line_length = resolution / kMinLineLengthFraction;
3080  int closing_brick = max_line_width / 3;
3081 
3082  // function call
3083  if (type == DS_DEVICE_OPENCL_DEVICE) {
3084 #if ON_WINDOWS
3085  QueryPerformanceCounter(&time_funct_start);
3086 #elif ON_APPLE
3087  start = mach_absolute_time();
3088 #else
3089  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3090 #endif
3091  Pix *src_pix = input.pix;
3092  OpenclDevice::gpuEnv = *env;
3093  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
3094  Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
3095  OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
3096 
3097  OpenclDevice::releaseMorphCLBuffers();
3098 
3099 #if ON_WINDOWS
3100  QueryPerformanceCounter(&time_funct_end);
3101  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3102 #elif ON_APPLE
3103  stop = mach_absolute_time();
3104  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
3105 #else
3106  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3107  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3108 #endif
3109  } else {
3110 #if ON_WINDOWS
3111  QueryPerformanceCounter(&time_funct_start);
3112 #elif ON_APPLE
3113  start = mach_absolute_time();
3114 #else
3115  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
3116 #endif
3117 
3118  // native serial code
3119  Pix *src_pix = input.pix;
3120  Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
3121  Pix *pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
3122  Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
3123  pixDestroy(&pix_solid);
3124  Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
3125  Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
3126  pixDestroy(&pix_hollow);
3127 
3128 #if ON_WINDOWS
3129  QueryPerformanceCounter(&time_funct_end);
3130  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
3131 #elif ON_APPLE
3132  stop = mach_absolute_time();
3133  time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;
3134 #else
3135  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
3136  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
3137 #endif
3138  }
3139 
3140  return time;
3141 }
3142 
3143 
3144 
3145 /******************************************************************************
3146  * Device Selection
3147  *****************************************************************************/
3148 
3149 #include "stdlib.h"
3150 
3151 
3152 // encode score object as byte string
3153 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
3154  *serializedScoreSize = sizeof(TessDeviceScore);
3155  *serializedScore = (void *) new unsigned char[*serializedScoreSize];
3156  memcpy(*serializedScore, device->score, *serializedScoreSize);
3157  return DS_SUCCESS;
3158 }
3159 
3160 // parses byte string and stores in score object
3161 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
3162  // check that serializedScoreSize == sizeof(TessDeviceScore);
3163  device->score = new TessDeviceScore;
3164  memcpy(device->score, serializedScore, serializedScoreSize);
3165  return DS_SUCCESS;
3166 }
3167 
3168 ds_status releaseScore( void* score ) {
3169  delete[] score;
3170  return DS_SUCCESS;
3171 }
3172 
3173 // evaluate devices
3174 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
3175 
3176  // overwrite statuc gpuEnv w/ current device
3177  // so native opencl calls can be used; they use static gpuEnv
3178  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
3179  GPUEnv *env = NULL;
3180  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
3181  env = new GPUEnv;
3182  //printf("[DS] populating tmp GPUEnv from device\n");
3183  populateGPUEnvFromDevice( env, device->oclDeviceID);
3184  env->mnFileCount = 0; //argc;
3185  env->mnKernelCount = 0UL;
3186  //printf("[DS] compiling kernels for tmp GPUEnv\n");
3187  OpenclDevice::gpuEnv = *env;
3188  OpenclDevice::CompileKernelFile(env, "");
3189  }
3190 
3191  TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
3192 
3193  // pixReadTiff
3194  double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
3195 
3196  // HistogramRect
3197  double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
3198 
3199  // ThresholdRectToPix
3200  double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
3201 
3202  // getLineMasks
3203  double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
3204 
3205 
3206  // weigh times (% of cpu time)
3207  // these weights should be the % execution time that the native cpu code took
3208  float composeRGBPixelWeight = 1.2f;
3209  float histogramRectWeight = 2.4f;
3210  float thresholdRectToPixWeight = 4.5f;
3211  float getLineMasksMorphWeight = 5.0f;
3212 
3213  float weightedTime =
3214  composeRGBPixelWeight * composeRGBPixelTime +
3215  histogramRectWeight * histogramRectTime +
3216  thresholdRectToPixWeight * thresholdRectToPixTime +
3217  getLineMasksMorphWeight * getLineMasksMorphTime
3218  ;
3219  device->score = (void *)new TessDeviceScore;
3220  ((TessDeviceScore *)device->score)->time = weightedTime;
3221 
3222  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
3223  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
3224  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
3225  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
3226  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
3227  printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
3228  return DS_SUCCESS;
3229 }
3230 
3231 // initial call to select device
3232 ds_device OpenclDevice::getDeviceSelection( ) {
3233  if (!deviceIsSelected) {
3234 PERF_COUNT_START("getDeviceSelection")
3235  // check if opencl is available at runtime
3236  if( 1 == LoadOpencl() ) {
3237  // opencl is available
3238 //PERF_COUNT_SUB("LoadOpencl")
3239  // setup devices
3240  ds_status status;
3241  ds_profile *profile;
3242  status = initDSProfile( &profile, "v0.1" );
3243 PERF_COUNT_SUB("initDSProfile")
3244  // try reading scores from file
3245  char *fileName = "tesseract_opencl_profile_devices.dat";
3246  status = readProfileFromFile( profile, deserializeScore, fileName);
3247  if (status != DS_SUCCESS) {
3248  // need to run evaluation
3249  printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
3250 
3251  // create input data
3252  TessScoreEvaluationInputData input;
3253  populateTessScoreEvaluationInputData( &input );
3254 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
3255  // perform evaluations
3256  unsigned int numUpdates;
3257  status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
3258 PERF_COUNT_SUB("profileDevices")
3259  // write scores to file
3260  if ( status == DS_SUCCESS ) {
3261  status = writeProfileToFile( profile, serializeScore, fileName);
3262 PERF_COUNT_SUB("writeProfileToFile")
3263  if ( status == DS_SUCCESS ) {
3264  printf("[DS] Scores written to file (%s).\n", fileName);
3265  } else {
3266  printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
3267  }
3268  } else {
3269  printf("[DS] Unable to evaluate performance; scores not written to file.\n");
3270  }
3271  } else {
3272 
3273 PERF_COUNT_SUB("readProfileFromFile")
3274  printf("[DS] Profile read from file (%s).\n", fileName);
3275  }
3276 
3277  // we now have device scores either from file or evaluation
3278  // select fastest using custom Tesseract selection algorithm
3279  float bestTime = FLT_MAX; // begin search with worst possible time
3280  int bestDeviceIdx = -1;
3281  for (int d = 0; d < profile->numDevices; d++) {
3282  ds_device device = profile->devices[d];
3283  TessDeviceScore score = *(TessDeviceScore *)device.score;
3284 
3285  float time = score.time;
3286  printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
3287  if (time < bestTime) {
3288  bestTime = time;
3289  bestDeviceIdx = d;
3290  }
3291  }
3292  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
3293  // cleanup
3294  // TODO: call destructor for profile object?
3295 
3296  bool overrided = false;
3297  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
3298  if (overrideDeviceStr != NULL) {
3299  int overrideDeviceIdx = atoi(overrideDeviceStr);
3300  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
3301  printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
3302  bestDeviceIdx = overrideDeviceIdx - 1;
3303  overrided = true;
3304  } else {
3305  printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
3306  }
3307  }
3308 
3309  if (overrided) {
3310  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
3311  }
3312  selectedDevice = profile->devices[bestDeviceIdx];
3313  // cleanup
3314  releaseDSProfile(profile, releaseScore);
3315  } else {
3316  // opencl isn't available at runtime, select native cpu device
3317  printf("[DS] OpenCL runtime not available.\n");
3318  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
3319  selectedDevice.oclDeviceName = "(null)";
3320  selectedDevice.score = NULL;
3321  selectedDevice.oclDeviceID = NULL;
3322  selectedDevice.oclDriverVersion = NULL;
3323  }
3324  deviceIsSelected = true;
3325 PERF_COUNT_SUB("select from Profile")
3327  }
3328 //PERF_COUNT_END
3329  return selectedDevice;
3330 }
3331 
3332 
3333 bool OpenclDevice::selectedDeviceIsOpenCL() {
3334  ds_device device = getDeviceSelection();
3335  return (device.type == DS_DEVICE_OPENCL_DEVICE);
3336 }
3337 
3338 bool OpenclDevice::selectedDeviceIsNativeCPU() {
3339  ds_device device = getDeviceSelection();
3340  return (device.type == DS_DEVICE_NATIVE_CPU);
3341 }
3342 
3343 
3344 
3356 #define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))
3357 
3358 Pix * OpenclDevice::pixConvertRGBToGrayOCL(
3359  Pix *srcPix, // 32-bit source
3360  float rwt,
3361  float gwt,
3362  float bwt )
3363 {
3364 PERF_COUNT_START("pixConvertRGBToGrayOCL")
3365  Pix *dstPix; // 8-bit destination
3366 
3367  if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL;
3368 
3369  if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) {
3370  // magic numbers from leptonica
3371  rwt = 0.3;
3372  gwt = 0.5;
3373  bwt = 0.2;
3374  }
3375  // normalize
3376  float sum = rwt + gwt + bwt;
3377  rwt /= sum;
3378  gwt /= sum;
3379  bwt /= sum;
3380 
3381  // source pix
3382  int w, h;
3383  pixGetDimensions(srcPix, &w, &h, NULL);
3384  //printf("Image is %i x %i\n", w, h);
3385  unsigned int *srcData = pixGetData(srcPix);
3386  int srcWPL = pixGetWpl(srcPix);
3387  int srcSize = srcWPL * h * sizeof(unsigned int);
3388 
3389  // destination pix
3390  if ((dstPix = pixCreate(w, h, 8)) == NULL)
3391  return NULL;
3392  pixCopyResolution(dstPix, srcPix);
3393  unsigned int *dstData = pixGetData(dstPix);
3394  int dstWPL = pixGetWpl(dstPix);
3395  int dstWords = dstWPL * h;
3396  int dstSize = dstWords * sizeof(unsigned int);
3397  //printf("dstSize = %i\n", dstSize);
3398 PERF_COUNT_SUB("pix setup")
3399 
3400  // opencl objects
3401  cl_int clStatus;
3402  KernelEnv kEnv;
3403  SetKernelEnv( &kEnv );
3404 
3405  // source buffer
3406  cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus );
3407  CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer");
3408 
3409  // destination buffer
3410  cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus );
3411  CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer");
3412 
3413  // setup work group size parameters
3414  int block_size = 256;
3415  int numWorkGroups = ((h*w+block_size-1) / block_size );
3416  int numThreads = block_size*numWorkGroups;
3417  size_t local_work_size[] = {static_cast<size_t>(block_size)};
3418  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
3419  //printf("Enqueueing %i threads for %i output pixels\n", numThreads, w*h);
3420 
3421  /* compile kernel */
3422  kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus );
3423  CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray");
3424 
3425 
3426  /* set kernel arguments */
3427  clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer );
3428  CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer");
3429  clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer );
3430  CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer");
3431  clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL );
3432  CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" );
3433  clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL );
3434  CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" );
3435  clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h );
3436  CHECK_OPENCL( clStatus, "clSetKernelArg height" );
3437  clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w );
3438  CHECK_OPENCL( clStatus, "clSetKernelArg width" );
3439  clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt );
3440  CHECK_OPENCL( clStatus, "clSetKernelArg rwt" );
3441  clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt );
3442  CHECK_OPENCL( clStatus, "clSetKernelArg gwt");
3443  clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt );
3444  CHECK_OPENCL( clStatus, "clSetKernelArg bwt");
3445 
3446  /* launch kernel & wait */
3447 PERF_COUNT_SUB("before")
3448  clStatus = clEnqueueNDRangeKernel(
3449  kEnv.mpkCmdQueue,
3450  kEnv.mpkKernel,
3451  1, NULL, global_work_size, local_work_size,
3452  0, NULL, NULL );
3453  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" );
3454  clFinish( kEnv.mpkCmdQueue );
3455 PERF_COUNT_SUB("kernel")
3456 
3457  /* map results back from gpu */
3458  void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus);
3459  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer");
3460  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL);
3461 
3462 #if 0
3463  // validate: compute on cpu
3464  Pix *cpuPix = pixCreate(w, h, 8);
3465  pixCopyResolution(cpuPix, srcPix);
3466  unsigned int *cpuData = pixGetData(cpuPix);
3467  int cpuWPL = pixGetWpl(cpuPix);
3468  unsigned int *cpuLine, *srcLine;
3469  int i, j;
3470  for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) {
3471  for (j = 0; j < w; j++) {
3472  unsigned int word = *(srcLine + j);
3473  int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) +
3474  gwt * ((word >> L_GREEN_SHIFT) & 0xff) +
3475  bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5);
3476  SET_DATA_BYTE(cpuLine, j, val);
3477  }
3478  srcLine += srcWPL;
3479  cpuLine += cpuWPL;
3480  }
3481 
3482  // validate: compare
3483  printf("converted 32-bit -> 8-bit image\n");
3484  for (int row = 0; row < h; row++) {
3485  for (int col = 0; col < w; col++) {
3486  int idx = row*w + col;
3487  unsigned int srcVal = srcData[idx];
3488  unsigned char cpuVal = ((unsigned char *)cpuData)[idx];
3489  unsigned char oclVal = ((unsigned char *)dstData)[idx];
3490  if (srcVal > 0) {
3491  printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal);
3492  }
3493  }
3494  //printf("\n");
3495  }
3496 #endif
3497  // release opencl objects
3498  clReleaseMemObject(srcBuffer);
3499  clReleaseMemObject(dstBuffer);
3500 
3501 
3503  // success
3504  return dstPix;
3505 }
3506 #endif
const int kHistogramSize
Definition: otsuthr.h:27
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:157
cmap_table cmap
unsigned int uinT32
Definition: host.h:103
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
#define FALSE
Definition: capi.h:29
#define PERF_COUNT_END
#define PERF_COUNT_SUB(SUB)
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
#define TRUE
Definition: capi.h:28
#define PERF_COUNT_START(FUNCT_NAME)
unsigned char uinT8
Definition: host.h:99
const char * kernel_src
Definition: oclkernels.h:11