tesseract 3.04.01

cube/char_bigrams.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        char_bigrams.cpp
00003  * Description: Implementation of a Character Bigrams Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <algorithm>
00021 #include <math.h>
00022 #include <string>
00023 #include <vector>
00024 
00025 #include "char_bigrams.h"
00026 #include "cube_utils.h"
00027 #include "ndminx.h"
00028 #include "cube_const.h"
00029 
00030 namespace tesseract {
00031 
00032 CharBigrams::CharBigrams() {
00033   memset(&bigram_table_, 0, sizeof(bigram_table_));
00034 }
00035 
00036 CharBigrams::~CharBigrams() {
00037   if (bigram_table_.char_bigram != NULL) {
00038     for (int ch1 = 0; ch1 <= bigram_table_.max_char; ch1++) {
00039       CharBigram *char_bigram = bigram_table_.char_bigram + ch1;
00040 
00041       if (char_bigram->bigram != NULL) {
00042         delete []char_bigram->bigram;
00043       }
00044     }
00045     delete []bigram_table_.char_bigram;
00046   }
00047 }
00048 
00049 CharBigrams *CharBigrams::Create(const string &data_file_path,
00050                                  const string &lang) {
00051   string file_name;
00052   string str;
00053 
00054   file_name = data_file_path + lang;
00055   file_name += ".cube.bigrams";
00056 
00057   // load the string into memory
00058   if (!CubeUtils::ReadFileToString(file_name, &str)) {
00059     return NULL;
00060   }
00061 
00062   // construct a new object
00063   CharBigrams *char_bigrams_obj = new CharBigrams();
00064   if (char_bigrams_obj == NULL) {
00065     fprintf(stderr, "Cube ERROR (CharBigrams::Create): could not create "
00066             "character bigrams object.\n");
00067     return NULL;
00068   }
00069   CharBigramTable *table = &char_bigrams_obj->bigram_table_;
00070 
00071   table->total_cnt = 0;
00072   table->max_char = -1;
00073   table->char_bigram = NULL;
00074 
00075   // split into lines
00076   vector<string> str_vec;
00077   CubeUtils::SplitStringUsing(str, "\r\n", &str_vec);
00078 
00079   for (int big = 0; big < str_vec.size(); big++) {
00080     char_32 ch1;
00081     char_32 ch2;
00082     int cnt;
00083     if (sscanf(str_vec[big].c_str(), "%d %x %x", &cnt, &ch1, &ch2) != 3) {
00084       fprintf(stderr, "Cube ERROR (CharBigrams::Create): invalid format "
00085               "reading line: %s\n", str_vec[big].c_str());
00086       delete char_bigrams_obj;
00087       return NULL;
00088     }
00089 
00090     // expand the bigram table
00091     if (ch1 > table->max_char) {
00092       CharBigram *char_bigram = new CharBigram[ch1 + 1];
00093       if (char_bigram == NULL) {
00094         fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating "
00095                 "additional memory for character bigram table.\n");
00096         return NULL;
00097       }
00098 
00099       if (table->char_bigram != NULL && table->max_char >= 0) {
00100         memcpy(char_bigram, table->char_bigram,
00101           (table->max_char + 1) * sizeof(*char_bigram));
00102 
00103         delete []table->char_bigram;
00104       }
00105       table->char_bigram = char_bigram;
00106 
00107       // init
00108       for (int new_big = table->max_char + 1; new_big <= ch1; new_big++) {
00109         table->char_bigram[new_big].total_cnt = 0;
00110         table->char_bigram[new_big].max_char = -1;
00111         table->char_bigram[new_big].bigram = NULL;
00112       }
00113       table->max_char = ch1;
00114     }
00115 
00116     if (ch2 > table->char_bigram[ch1].max_char) {
00117       Bigram *bigram = new Bigram[ch2 + 1];
00118       if (bigram == NULL) {
00119         fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating "
00120                 "memory for bigram.\n");
00121         delete char_bigrams_obj;
00122         return NULL;
00123       }
00124 
00125       if (table->char_bigram[ch1].bigram != NULL &&
00126           table->char_bigram[ch1].max_char >= 0) {
00127         memcpy(bigram, table->char_bigram[ch1].bigram,
00128           (table->char_bigram[ch1].max_char + 1) * sizeof(*bigram));
00129         delete []table->char_bigram[ch1].bigram;
00130       }
00131       table->char_bigram[ch1].bigram = bigram;
00132 
00133       // init
00134       for (int new_big = table->char_bigram[ch1].max_char + 1;
00135            new_big <= ch2; new_big++) {
00136         table->char_bigram[ch1].bigram[new_big].cnt = 0;
00137       }
00138       table->char_bigram[ch1].max_char = ch2;
00139     }
00140 
00141     table->char_bigram[ch1].bigram[ch2].cnt = cnt;
00142     table->char_bigram[ch1].total_cnt += cnt;
00143     table->total_cnt += cnt;
00144   }
00145 
00146   // compute costs (-log probs)
00147   table->worst_cost = static_cast<int>(
00148       -PROB2COST_SCALE * log(0.5 / table->total_cnt));
00149   for (char_32 ch1 = 0; ch1 <= table->max_char; ch1++) {
00150     for (char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) {
00151       int cnt = table->char_bigram[ch1].bigram[ch2].cnt;
00152       table->char_bigram[ch1].bigram[ch2].cost =
00153           static_cast<int>(-PROB2COST_SCALE *
00154                            log(MAX(0.5, static_cast<double>(cnt)) /
00155                                table->total_cnt));
00156     }
00157   }
00158   return char_bigrams_obj;
00159 }
00160 
00161 int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const {
00162   if (ch1 > bigram_table_.max_char) {
00163     return bigram_table_.worst_cost;
00164   }
00165   if (ch2 > bigram_table_.char_bigram[ch1].max_char) {
00166     return bigram_table_.worst_cost;
00167   }
00168   return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
00169 }
00170 
00171 int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const {
00172   if (!char_32_ptr || char_32_ptr[0] == 0) {
00173     return bigram_table_.worst_cost;
00174   }
00175   int cost = MeanCostWithSpaces(char_32_ptr);
00176   if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
00177       CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
00178     char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
00179     if (lower_32 && lower_32[0] != 0) {
00180       int cost_lower = MeanCostWithSpaces(lower_32);
00181       cost = MIN(cost, cost_lower);
00182       delete [] lower_32;
00183     }
00184     char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
00185     if (upper_32 && upper_32[0] != 0) {
00186       int cost_upper = MeanCostWithSpaces(upper_32);
00187       cost = MIN(cost, cost_upper);
00188       delete [] upper_32;
00189     }
00190   }
00191   return cost;
00192 }
00193 
00194 int CharBigrams::MeanCostWithSpaces(const char_32 *char_32_ptr) const {
00195   if (!char_32_ptr)
00196     return bigram_table_.worst_cost;
00197   int len = CubeUtils::StrLen(char_32_ptr);
00198   int cost = 0;
00199   int c = 0;
00200   cost = PairCost(' ', char_32_ptr[0]);
00201   for (c = 1; c < len; c++) {
00202     cost += PairCost(char_32_ptr[c - 1], char_32_ptr[c]);
00203   }
00204   cost += PairCost(char_32_ptr[len - 1], ' ');
00205   return static_cast<int>(cost / static_cast<double>(len + 1));
00206 }
00207 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines