// tesseract 3.04.01
00001 // Include automatically generated configuration file if running autoconf. 00002 #ifdef HAVE_CONFIG_H 00003 #include "config_auto.h" 00004 #endif 00005 00006 #include <string.h> 00007 #include "baseapi.h" 00008 #include "genericvector.h" 00009 #include "renderer.h" 00010 00011 namespace tesseract { 00012 00013 /********************************************************************** 00014 * Base Renderer interface implementation 00015 **********************************************************************/ 00016 TessResultRenderer::TessResultRenderer(const char *outputbase, 00017 const char* extension) 00018 : file_extension_(extension), 00019 title_(""), imagenum_(-1), 00020 fout_(stdout), 00021 next_(NULL), 00022 happy_(true) { 00023 if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) { 00024 STRING outfile = STRING(outputbase) + STRING(".") + STRING(file_extension_); 00025 fout_ = fopen(outfile.string(), "wb"); 00026 if (fout_ == NULL) { 00027 happy_ = false; 00028 } 00029 } 00030 } 00031 00032 TessResultRenderer::~TessResultRenderer() { 00033 if (fout_ != stdout) 00034 fclose(fout_); 00035 else 00036 clearerr(fout_); 00037 delete next_; 00038 } 00039 00040 void TessResultRenderer::insert(TessResultRenderer* next) { 00041 if (next == NULL) return; 00042 00043 TessResultRenderer* remainder = next_; 00044 next_ = next; 00045 if (remainder) { 00046 while (next->next_ != NULL) { 00047 next = next->next_; 00048 } 00049 next->next_ = remainder; 00050 } 00051 } 00052 00053 bool TessResultRenderer::BeginDocument(const char* title) { 00054 if (!happy_) return false; 00055 title_ = title; 00056 imagenum_ = -1; 00057 bool ok = BeginDocumentHandler(); 00058 if (next_) { 00059 ok = next_->BeginDocument(title) && ok; 00060 } 00061 return ok; 00062 } 00063 00064 bool TessResultRenderer::AddImage(TessBaseAPI* api) { 00065 if (!happy_) return false; 00066 ++imagenum_; 00067 bool ok = AddImageHandler(api); 00068 if (next_) { 00069 ok = next_->AddImage(api) && ok; 
00070 } 00071 return ok; 00072 } 00073 00074 bool TessResultRenderer::EndDocument() { 00075 if (!happy_) return false; 00076 bool ok = EndDocumentHandler(); 00077 if (next_) { 00078 ok = next_->EndDocument() && ok; 00079 } 00080 return ok; 00081 } 00082 00083 void TessResultRenderer::AppendString(const char* s) { 00084 AppendData(s, strlen(s)); 00085 } 00086 00087 void TessResultRenderer::AppendData(const char* s, int len) { 00088 int n = fwrite(s, 1, len, fout_); 00089 if (n != len) happy_ = false; 00090 } 00091 00092 bool TessResultRenderer::BeginDocumentHandler() { 00093 return happy_; 00094 } 00095 00096 bool TessResultRenderer::EndDocumentHandler() { 00097 return happy_; 00098 } 00099 00100 00101 /********************************************************************** 00102 * UTF8 Text Renderer interface implementation 00103 **********************************************************************/ 00104 TessTextRenderer::TessTextRenderer(const char *outputbase) 00105 : TessResultRenderer(outputbase, "txt") { 00106 } 00107 00108 bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { 00109 char* utf8 = api->GetUTF8Text(); 00110 if (utf8 == NULL) { 00111 return false; 00112 } 00113 00114 AppendString(utf8); 00115 delete[] utf8; 00116 00117 bool pageBreak = false; 00118 api->GetBoolVariable("include_page_breaks", &pageBreak); 00119 const char* pageSeparator = api->GetStringVariable("page_separator"); 00120 if (pageBreak) { 00121 AppendString(pageSeparator); 00122 } 00123 00124 return true; 00125 } 00126 00127 /********************************************************************** 00128 * HOcr Text Renderer interface implementation 00129 **********************************************************************/ 00130 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) 00131 : TessResultRenderer(outputbase, "hocr") { 00132 font_info_ = false; 00133 } 00134 00135 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) 00136 : 
TessResultRenderer(outputbase, "hocr") { 00137 font_info_ = font_info; 00138 } 00139 00140 bool TessHOcrRenderer::BeginDocumentHandler() { 00141 AppendString( 00142 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" 00143 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" 00144 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" 00145 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " 00146 "lang=\"en\">\n <head>\n <title>"); 00147 AppendString(title()); 00148 AppendString( 00149 "</title>\n" 00150 "<meta http-equiv=\"Content-Type\" content=\"text/html;" 00151 "charset=utf-8\" />\n" 00152 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR 00153 "' />\n" 00154 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par" 00155 " ocr_line ocrx_word"); 00156 if (font_info_) 00157 AppendString( 00158 " ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf"); 00159 AppendString( 00160 "'/>\n" 00161 "</head>\n<body>\n"); 00162 00163 return true; 00164 } 00165 00166 bool TessHOcrRenderer::EndDocumentHandler() { 00167 AppendString(" </body>\n</html>\n"); 00168 00169 return true; 00170 } 00171 00172 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { 00173 char* hocr = api->GetHOCRText(imagenum()); 00174 if (hocr == NULL) return false; 00175 00176 AppendString(hocr); 00177 delete[] hocr; 00178 00179 return true; 00180 } 00181 00182 /********************************************************************** 00183 * UNLV Text Renderer interface implementation 00184 **********************************************************************/ 00185 TessUnlvRenderer::TessUnlvRenderer(const char *outputbase) 00186 : TessResultRenderer(outputbase, "unlv") { 00187 } 00188 00189 bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) { 00190 char* unlv = api->GetUNLVText(); 00191 if (unlv == NULL) return false; 00192 00193 AppendString(unlv); 00194 delete[] unlv; 00195 00196 return true; 00197 } 00198 00199 
/********************************************************************** 00200 * BoxText Renderer interface implementation 00201 **********************************************************************/ 00202 TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase) 00203 : TessResultRenderer(outputbase, "box") { 00204 } 00205 00206 bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) { 00207 char* text = api->GetBoxText(imagenum()); 00208 if (text == NULL) return false; 00209 00210 AppendString(text); 00211 delete[] text; 00212 00213 return true; 00214 } 00215 00216 /********************************************************************** 00217 * Osd Text Renderer interface implementation 00218 **********************************************************************/ 00219 TessOsdRenderer::TessOsdRenderer(const char* outputbase) 00220 : TessResultRenderer(outputbase, "osd") { 00221 } 00222 00223 bool TessOsdRenderer::AddImageHandler(TessBaseAPI* api) { 00224 char* osd = api->GetOsdText(imagenum()); 00225 if (osd == NULL) return false; 00226 00227 AppendString(osd); 00228 delete[] osd; 00229 00230 return true; 00231 } 00232 00233 } // namespace tesseract