recog.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. /*====================================================================*
  2. - Copyright (C) 2001 Leptonica. All rights reserved.
  3. -
  4. - Redistribution and use in source and binary forms, with or without
  5. - modification, are permitted provided that the following conditions
  6. - are met:
  7. - 1. Redistributions of source code must retain the above copyright
  8. - notice, this list of conditions and the following disclaimer.
  9. - 2. Redistributions in binary form must reproduce the above
  10. - copyright notice, this list of conditions and the following
  11. - disclaimer in the documentation and/or other materials
  12. - provided with the distribution.
  13. -
  14. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  15. - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  16. - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  17. - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
  18. - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19. - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20. - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21. - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22. - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  23. - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. *====================================================================*/
  26. #ifndef LEPTONICA_RECOG_H
  27. #define LEPTONICA_RECOG_H
  28. /*!
  29. * \file recog.h
  30. *
  31. * <pre>
  32. * This is a simple utility for training and recognizing individual
  33. * machine-printed text characters. It is designed to be adapted
  34. * to a particular set of character images; e.g., from a book.
  35. *
  36. * There are two methods of training the recognizer. In the most
  37. * simple, a set of bitmaps has been labeled by some means, such as
  38. * a generic OCR program. This is input either one template at a time
  39. * or as a pixa of templates, to a function that creates a recog.
  40. * If in a pixa, the text string label must be embedded in the
  41. * text field of each pix.
  42. *
  43. * If labeled data is not available, we start with a bootstrap
  44. * recognizer (BSR) that has labeled data from a variety of sources.
  45. * These images are scaled, typically to a fixed height, and then
  46. * fed similarly scaled unlabeled images from the source (e.g., book),
  47. * and the BSR attempts to identify them. All images that have
  48. * a high enough correlation score with one of the templates in the
  49. * BSR are emitted in a pixa, which now holds unscaled and labeled
  50. * templates from the source. This is the generator for a book adapted
  51. * recognizer (BAR).
  52. *
  53. * The pixa should always be thought of as the primary structure.
  54. * It is the generator for the recog, because a recog is built
  55. * from a pixa of unscaled images.
  56. *
  57. * New image templates can be added to a recog as long as it is
  58. * in training mode. Once training is finished, to add templates
  59. * it is necessary to extract the generating pixa, add templates
  60. * to that pixa, and make a new recog. Similarly, we do not
  61. * join two recog; instead, we simply join their generating pixa,
  62. * and make a recog from that.
  63. *
  64. * To remove outliers from a pixa of labeled pix, make a recog,
  65. * determine the outliers, and generate a new pixa with the
  66. * outliers removed. The outliers are determined by building
  67. * special templates for each character set that are scaled averages
  68. * of the individual templates. Then a correlation score is found
  69. * between each template and the averaged templates. There are
  70. * two implementations; outliers are determined as either:
  71. * (1) a template having a correlation score with its class average
  72. * that is below a threshold, or
  73. * (2) a template having a correlation score with its class average
  74. * that is smaller than the correlation score with the average
  75. * of another class.
  76. * Outliers are removed from the generating pixa. Scaled averaging
  77. * is only performed for determining outliers and for splitting
  78. * characters; it is never used in a trained recognizer for identifying
  79. * unlabeled samples.
  80. *
  81. * Two methods using averaged templates are provided for splitting
  82. * touching characters:
  83. * (1) greedy matching
  84. * (2) document image decoding (DID)
  85. * The DID method is the default. It is about 5x faster and
  86. * possibly more accurate.
  87. *
  88. * Once a BAR has been made, unlabeled sample images are identified
  89. * by finding the individual template in the BAR with highest
  90. * correlation. The input images and images in the BAR can be
  91. * represented in two ways:
  92. * (1) as scanned, binarized to 1 bpp
  93. * (2) as a width-normalized outline formed by thinning to a
  94. * skeleton and then dilating by a fixed amount.
  95. *
  96. * The recog can be serialized to file and read back. The serialized
  97. * version holds the templates used for correlation (which may have
  98. * been modified by scaling and turning into lines from the unscaled
  99. * templates), plus, for arbitrary character sets, the UTF8
  100. * representation and the lookup table mapping from the character
  101. * representation to index.
  102. *
  103. * Why do we not use averaged templates for recognition?
  104. * Letterforms can take on significantly different shapes (e.g.,
  105. * the letters 'a' and 'g'), and it makes no sense to average these.
  106. * The previous version of this utility allowed multiple recognizers
  107. * to exist, but this is an unnecessary complication if recognition
  108. * is done on all samples instead of on averages.
  109. * </pre>
  110. */
  /*! Version number of the recog utility. NOTE(review): presumably the */
  /*  version tag of the serialized recog format -- confirm in the I/O code. */
  111. #define RECOG_VERSION_NUMBER 2
  /*! Character recognizer: scaling/training parameters, the unscaled, */
  /*  scaled and averaged template collections with their centroids and */
  /*  areas, debug images, and temporary data used during identification. */
  112. struct L_Recog {
  113. l_int32 scalew; /*!< scale all examples to this width; */
  114. /*!< use 0 to prevent horizontal scaling */
  115. l_int32 scaleh; /*!< scale all examples to this height; */
  116. /*!< use 0 to prevent vertical scaling */
  117. l_int32 linew; /*!< use a value > 0 to convert the bitmap */
  118. /*!< to lines of fixed width; 0 to skip */
  119. l_int32 templ_use; /*!< template use: use either the average */
  120. /*!< or all templates (L_USE_AVERAGE_TEMPLATES */
  121. /*!< or L_USE_ALL_TEMPLATES) */
  122. l_int32 maxarraysize; /*!< initialize container arrays to this */
  123. l_int32 setsize; /*!< size of character set */
  124. l_int32 threshold; /*!< for binarizing if depth > 1 */
  125. l_int32 maxyshift; /*!< vertical jiggle on nominal centroid */
  126. /*!< alignment; typically 0 or 1 */
  127. l_int32 charset_type; /*!< one of L_ARABIC_NUMERALS, etc. */
  128. l_int32 charset_size; /*!< expected number of classes in charset */
  129. l_int32 min_nopad; /*!< min number of samples without padding */
  130. l_int32 num_samples; /*!< number of training samples */
  131. l_int32 minwidth_u; /*!< min width averaged unscaled templates */
  132. l_int32 maxwidth_u; /*!< max width averaged unscaled templates */
  133. l_int32 minheight_u; /*!< min height averaged unscaled templates */
  134. l_int32 maxheight_u; /*!< max height averaged unscaled templates */
  135. l_int32 minwidth; /*!< min width averaged scaled templates */
  136. l_int32 maxwidth; /*!< max width averaged scaled templates */
  137. l_int32 ave_done; /*!< set to 1 when averaged bitmaps are made */
  138. l_int32 train_done; /*!< set to 1 when training is complete or */
  139. /*!< identification has started */
  140. l_float32 max_wh_ratio; /*!< max width/height ratio to split */
  141. l_float32 max_ht_ratio; /*!< max of max/min template height ratio */
  142. l_int32 min_splitw; /*!< min component width kept in splitting */
  143. l_int32 max_splith; /*!< max component height kept in splitting */
  144. struct Sarray *sa_text; /*!< text array for arbitrary char set */
  145. struct L_Dna *dna_tochar; /*!< index-to-char lut for arbitrary charset */
  146. l_int32 *centtab; /*!< table for finding centroids */
  147. l_int32 *sumtab; /*!< table for finding pixel sums */
  148. struct Pixaa *pixaa_u; /*!< all unscaled templates for each class */
  149. struct Ptaa *ptaa_u; /*!< centroids of all unscaled templates */
  150. struct Numaa *naasum_u; /*!< area of all unscaled templates */
  151. struct Pixaa *pixaa; /*!< all (scaled) templates for each class */
  152. struct Ptaa *ptaa; /*!< centroids of all (scaled) templates */
  153. struct Numaa *naasum; /*!< area of all (scaled) templates */
  154. struct Pixa *pixa_u; /*!< averaged unscaled templates per class */
  155. struct Pta *pta_u; /*!< centroids of unscaled ave. templates */
  156. struct Numa *nasum_u; /*!< area of unscaled averaged templates */
  157. struct Pixa *pixa; /*!< averaged (scaled) templates per class */
  158. struct Pta *pta; /*!< centroids of (scaled) ave. templates */
  159. struct Numa *nasum; /*!< area of (scaled) averaged templates */
  160. struct Pixa *pixa_tr; /*!< all input training images */
  161. struct Pixa *pixadb_ave; /*!< unscaled and scaled averaged bitmaps */
  162. struct Pixa *pixa_id; /*!< input images for identifying */
  163. struct Pix *pixdb_ave; /*!< debug: best match of input against ave. */
  164. struct Pix *pixdb_range; /*!< debug: best matches within range */
  165. struct Pixa *pixadb_boot; /*!< debug: bootstrap training results */
  166. struct Pixa *pixadb_split; /*!< debug: splitting results */
  167. struct L_Bmf *bmf; /*!< bmf fonts */
  168. l_int32 bmf_size; /*!< font size of bmf; default is 6 pt */
  169. struct L_Rdid *did; /*!< temp data used for image decoding */
  170. struct L_Rch *rch; /*!< temp data used for holding best char */
  171. struct L_Rcha *rcha; /*!< temp data used for array of best chars */
  172. };
  173. typedef struct L_Recog L_RECOG;
  174. /*!
  175. * Data returned from correlation matching on a single character.
  * struct L_Recog refers to one of these through its rch field,
  * as temporary data holding the best character.
  176. */
  177. struct L_Rch {
  178. l_int32 index; /*!< index of best template */
  179. l_float32 score; /*!< correlation score of best template */
  180. char *text; /*!< character string of best template */
  181. l_int32 sample; /*!< index of best sample (within the best */
  182. /*!< template class, if all samples are used) */
  183. l_int32 xloc; /*!< x-location of template (delx + shiftx) */
  184. l_int32 yloc; /*!< y-location of template (dely + shifty) */
  185. l_int32 width; /*!< width of best template */
  186. };
  187. typedef struct L_Rch L_RCH;
  188. /*!
  189. * Data returned from correlation matching on an array of characters.
  * Each field is the array counterpart of the same-purpose scalar field
  * in struct L_Rch (index, score, text, sample, xloc, yloc, width).
  * struct L_Recog refers to one of these through its rcha field.
  190. */
  191. struct L_Rcha {
  192. struct Numa *naindex; /*!< indices of best templates */
  193. struct Numa *nascore; /*!< correlation scores of best templates */
  194. struct Sarray *satext; /*!< character strings of best templates */
  195. struct Numa *nasample; /*!< indices of best samples */
  196. struct Numa *naxloc; /*!< x-locations of templates (delx + shiftx) */
  197. struct Numa *nayloc; /*!< y-locations of templates (dely + shifty) */
  198. struct Numa *nawidth; /*!< widths of best templates */
  199. };
  200. typedef struct L_Rcha L_RCHA;
  201. /*!
  202. * Data used for decoding a line of characters.
  * struct L_Recog refers to one of these through its did field,
  * as temporary data used for image decoding.
  203. */
  204. struct L_Rdid {
  205. struct Pix *pixs; /*!< clone of pix to be decoded */
  206. l_int32 **counta; /*!< count array for each averaged template */
  207. l_int32 **delya; /*!< best y-shift array per average template */
  208. l_int32 narray; /*!< number of averaged templates */
  209. l_int32 size; /*!< size of count array (width of pixs) */
  210. l_int32 *setwidth; /*!< setwidths for each template */
  211. struct Numa *nasum; /*!< pixel count in pixs by column */
  212. struct Numa *namoment; /*!< first moment of pixels in pixs by cols */
  213. l_int32 fullarrays; /*!< 1 if full arrays are made; 0 otherwise */
  214. l_float32 *beta; /*!< channel coeffs for template fg term */
  215. l_float32 *gamma; /*!< channel coeffs for bit-and term */
  216. l_float32 *trellisscore; /*!< score on trellis */
  217. l_int32 *trellistempl; /*!< template on trellis (for backtrack) */
  218. struct Numa *natempl; /*!< indices of best path templates */
  219. struct Numa *naxloc; /*!< x locations of best path templates */
  220. struct Numa *nadely; /*!< y locations of best path templates */
  221. struct Numa *nawidth; /*!< widths of best path templates */
  222. struct Boxa *boxa; /*!< Viterbi result for splitting input pixs */
  223. struct Numa *nascore; /*!< correlation scores: best path templates */
  224. struct Numa *natempl_r; /*!< indices of best rescored templates */
  225. struct Numa *nasample_r; /*!< samples of best scored templates */
  226. struct Numa *naxloc_r; /*!< x locations of best rescored templates */
  227. struct Numa *nadely_r; /*!< y locations of best rescored templates */
  228. struct Numa *nawidth_r; /*!< widths of best rescored templates */
  229. struct Numa *nascore_r; /*!< correlation scores: rescored templates */
  230. };
  231. typedef struct L_Rdid L_RDID;
  232. /*-------------------------------------------------------------------------*
  233. * Flags for describing limited character sets *
  234. *-------------------------------------------------------------------------*/
  235. /*! Flags for describing limited character sets: L_Recog.charset_type */
  236. enum {
  237. L_UNKNOWN = 0, /*!< character set type is not specified */
  238. L_ARABIC_NUMERALS = 1, /*!< 10 digits */
  239. L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m) */
  240. L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M) */
  241. L_LC_ALPHA = 4, /*!< 26 lower-case letters */
  242. L_UC_ALPHA = 5 /*!< 26 upper-case letters */
  243. };
  244. /*-------------------------------------------------------------------------*
  245. * Flags for selecting between using average and all templates *
  246. *-------------------------------------------------------------------------*/
  247. /*! Flags for selecting average or all templates: values for the templ_use field of struct L_Recog */
  248. enum {
  249. L_USE_ALL_TEMPLATES = 0, /*!< use all templates; default */
  250. L_USE_AVERAGE_TEMPLATES = 1 /*!< use average templates; special cases */
  251. };
  252. #endif /* LEPTONICA_RECOG_H */