  /*====================================================================*
   -  Copyright (C) 2001 Leptonica.  All rights reserved.
   -
   -  Redistribution and use in source and binary forms, with or without
   -  modification, are permitted provided that the following conditions
   -  are met:
   -  1. Redistributions of source code must retain the above copyright
   -     notice, this list of conditions and the following disclaimer.
   -  2. Redistributions in binary form must reproduce the above
   -     copyright notice, this list of conditions and the following
   -     disclaimer in the documentation and/or other materials
   -     provided with the distribution.
   -
   -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
   -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   *====================================================================*/
  #ifndef  LEPTONICA_RECOG_H
  #define  LEPTONICA_RECOG_H
  /*
   *  recog.h
   *
   *     A simple utility for training and recognizing individual
   *     machine-printed text characters.  In an application, one can
   *     envision using a number of these, one for each trained set.
   *
   *     In training mode, a set of labelled bitmaps is presented, either
   *     one at a time, or in a directory, or in a pixa.  If in a directory,
   *     or a pixa, the labelling text string must be embedded in the
   *     text field of the image file.
   *
   *     Any number of recognizers (L_Recog) can be trained and then used
   *     together in an array (L_Recoga).  All these trained structures
   *     can be serialized to file and read back.  The serialized version
   *     holds all the bitmaps used for training, plus, for arbitrary
   *     character sets, the UTF8 representation and the lookup table
   *     mapping from the character representation to index.
   *
   *     There are three levels of "sets" here:
   *
   *       (1) Example set: the examples representing a character that
   *           were printed in the same way, so that they can be combined
   *           without scaling to form an "average" template for the character.
   *           In the recognition phase, we use either this aligned average,
   *           or the individual bitmaps.  All examples in the set are given
   *           the same character label.   Example: the letter 'a' in the
   *           predominant font in a book.
   *
   *       (2) Character set (represented by L_Recog, a single recognizer):
   *           The set of different characters, each of which is described
   *           by (1).  Each element of the set has a different character
   *           label.  Example: the digits '0' through '9' that are used for
   *           page numbering in a book.
   *
   *       (3) Recognizer set (represented by L_Recoga, an array of recogs):
   *           A set of recognizers, each of which is described by (2).
   *           In general, we do not want to combine the character sets
   *           with the same labels within different recognizer sets,
   *           because the bitmaps can differ in font type, style or size.
   *           Example 1: the letter 'a' can be printed in two very different
   *           ways (either with a large loop or with a smaller loop in
   *           the lower half); both share the same label but need to be
   *           distinguished so that they are not mixed when averaging.
   *           Example 2: a recognizer trained for a book may be missing
   *           some characters, so we need to supplement it with another
   *           "generic" or "bootstrap" recognizer that has the additional
   *           characters from a variety of sources.  Bootstrap recognizers
   *           must be run in a mode where all characters are scaled.
   *
   *     In the recognition process, for each component in an input image,
   *     each recognizer (L_Recog) records the best match (highest
   *     correlation score).  If there is more than one recognizer, these
   *     results are aggregated to find the best match for each character
   *     for all the recognizers, and this is stored in L_Recoga.
   */
  /* Version number of the recog definitions in this header */
  #define  RECOG_VERSION_NUMBER      1
  85. struct L_Recoga {
  86. l_int32 n; /* number of recogs */
  87. l_int32 nalloc; /* number of recog ptrs allocated */
  88. struct L_Recog **recog; /* recog ptr array */
  89. struct L_Rcha *rcha; /* stores the array of best chars */
  90. };
  91. typedef struct L_Recoga L_RECOGA;
  92. struct L_Recog {
  93. l_int32 scalew; /* scale all examples to this width; */
  94. /* use 0 prevent horizontal scaling */
  95. l_int32 scaleh; /* scale all examples to this height; */
  96. /* use 0 prevent vertical scaling */
  97. l_int32 templ_type; /* template type: either an average of */
  98. /* examples (L_USE_AVERAGE) or the set */
  99. /* of all examples (L_USE_ALL) */
  100. l_int32 maxarraysize; /* initialize container arrays to this */
  101. l_int32 setsize; /* size of character set */
  102. l_int32 threshold; /* for binarizing if depth > 1 */
  103. l_int32 maxyshift; /* vertical jiggle on nominal centroid */
  104. /* alignment; typically 0 or 1 */
  105. l_float32 asperity_fr; /* +- allowed fractional asperity ratio */
  106. l_int32 charset_type; /* one of L_ARABIC_NUMERALS, etc. */
  107. l_int32 charset_size; /* expected number of classes in charset */
  108. char *bootdir; /* dir with bootstrap pixa charsets */
  109. char *bootpattern; /* file pattern for bootstrap pixa charsets */
  110. char *bootpath; /* path for single bootstrap pixa charset */
  111. l_int32 min_nopad; /* min number of samples without padding */
  112. l_int32 max_afterpad; /* max number of samples after padding */
  113. l_int32 samplenum; /* keep track of number of training samples */
  114. l_int32 minwidth_u; /* min width of averaged unscaled templates */
  115. l_int32 maxwidth_u; /* max width of averaged unscaled templates */
  116. l_int32 minheight_u; /* min height of averaged unscaled templates */
  117. l_int32 maxheight_u; /* max height of averaged unscaled templates */
  118. l_int32 minwidth; /* min width of averaged scaled templates */
  119. l_int32 maxwidth; /* max width of averaged scaled templates */
  120. l_int32 ave_done; /* set to 1 when averaged bitmaps are made */
  121. l_int32 train_done; /* set to 1 when training is complete or */
  122. /* identification has started */
  123. l_int32 min_splitw; /* min component width kept in splitting */
  124. l_int32 min_splith; /* min component height kept in splitting */
  125. l_int32 max_splith; /* max component height kept in splitting */
  126. struct Sarray *sa_text; /* text array for arbitrary char set */
  127. struct L_Dna *dna_tochar; /* index-to-char lut for arbitrary char set */
  128. l_int32 *centtab; /* table for finding centroids */
  129. l_int32 *sumtab; /* table for finding pixel sums */
  130. char *fname; /* serialized filename (if read) */
  131. struct Pixaa *pixaa_u; /* all unscaled bitmaps for each class */
  132. struct Pixa *pixa_u; /* averaged unscaled bitmaps for each class */
  133. struct Ptaa *ptaa_u; /* centroids of all unscaled bitmaps */
  134. struct Pta *pta_u; /* centroids of unscaled averaged bitmaps */
  135. struct Numaa *naasum_u; /* area of all unscaled bitmap examples */
  136. struct Numa *nasum_u; /* area of unscaled averaged bitmaps */
  137. struct Pixaa *pixaa; /* all bitmap examples for each class */
  138. struct Pixa *pixa; /* averaged bitmaps for each class */
  139. struct Ptaa *ptaa; /* centroids of all bitmap examples */
  140. struct Pta *pta; /* centroids of averaged bitmaps */
  141. struct Numaa *naasum; /* area of all bitmap examples */
  142. struct Numa *nasum; /* area of averaged bitmaps */
  143. struct Pixa *pixa_tr; /* input training images */
  144. struct Pixa *pixadb_ave; /* unscaled and scaled averaged bitmaps */
  145. struct Pixa *pixa_id; /* input images for identifying */
  146. struct Pix *pixdb_ave; /* debug: best match of input against ave. */
  147. struct Pix *pixdb_range; /* debug: best matches within range */
  148. struct Pixa *pixadb_boot; /* debug: bootstrap training results */
  149. struct Pixa *pixadb_split; /* debug: splitting results */
  150. char *fontdir; /* directory for bitmapped fonts */
  151. struct L_Bmf *bmf; /* bmf fonts */
  152. l_int32 bmf_size; /* font size of bmf; default is 6 pt */
  153. struct L_Rdid *did; /* temp data used for image decoding */
  154. struct L_Rch *rch; /* temp data used for holding best char */
  155. struct L_Rcha *rcha; /* temp data used for array of best chars */
  156. l_int32 bootrecog; /* 1 if using bootstrap samples; else 0 */
  157. l_int32 index; /* recog index in recoga; -1 if no parent */
  158. struct L_Recoga *parent; /* ptr to parent array; can be null */
  159. };
  160. typedef struct L_Recog L_RECOG;
  161. /*
  162. * Data returned from correlation matching on a single character
  163. */
  164. struct L_Rch {
  165. l_int32 index; /* index of best template */
  166. l_float32 score; /* correlation score of best template */
  167. char *text; /* character string of best template */
  168. l_int32 sample; /* index of best sample (within the best */
  169. /* template class, if all samples are used) */
  170. l_int32 xloc; /* x-location of template (delx + shiftx) */
  171. l_int32 yloc; /* y-location of template (dely + shifty) */
  172. l_int32 width; /* width of best template */
  173. };
  174. typedef struct L_Rch L_RCH;
  /*
   *  Data returned from correlation matching on an array of characters.
   *  The members are parallel arrays with the same meanings as the
   *  like-named scalar fields of L_Rch.
   */
  struct L_Rcha {
      struct Numa   *naindex;      /* indices of best templates              */
      struct Numa   *nascore;      /* correlation scores of best templates   */
      struct Sarray *satext;       /* character strings of best templates    */
      struct Numa   *nasample;     /* indices of best samples                */
      struct Numa   *naxloc;       /* x-locations of templates               */
                                   /* (delx + shiftx)                        */
      struct Numa   *nayloc;       /* y-locations of templates               */
                                   /* (dely + shifty)                        */
      struct Numa   *nawidth;      /* widths of best templates               */
  };
  typedef struct L_Rcha L_RCHA;
  188. /*
  189. * Data used for decoding a line of characters.
  190. */
  191. struct L_Rdid {
  192. struct Pix *pixs; /* clone of pix to be decoded */
  193. l_int32 **counta; /* count array for each averaged template */
  194. l_int32 **delya; /* best y-shift array per averaged template */
  195. l_int32 narray; /* number of averaged templates */
  196. l_int32 size; /* size of count array (width of pixs) */
  197. l_int32 *setwidth; /* setwidths for each template */
  198. struct Numa *nasum; /* pixel count in pixs by column */
  199. struct Numa *namoment; /* first moment of pixels in pixs by column */
  200. l_int32 fullarrays; /* 1 if full arrays are made; 0 otherwise */
  201. l_float32 *beta; /* channel coeffs for template fg term */
  202. l_float32 *gamma; /* channel coeffs for bit-and term */
  203. l_float32 *trellisscore; /* score on trellis */
  204. l_int32 *trellistempl; /* template on trellis (for backtrack) */
  205. struct Numa *natempl; /* indices of best path templates */
  206. struct Numa *naxloc; /* x locations of best path templates */
  207. struct Numa *nadely; /* y locations of best path templates */
  208. struct Numa *nawidth; /* widths of best path templates */
  209. struct Numa *nascore; /* correlation scores: best path templates */
  210. struct Numa *natempl_r; /* indices of best rescored templates */
  211. struct Numa *naxloc_r; /* x locations of best rescoredtemplates */
  212. struct Numa *nadely_r; /* y locations of best rescoredtemplates */
  213. struct Numa *nawidth_r; /* widths of best rescoredtemplates */
  214. struct Numa *nascore_r; /* correlation scores: rescored templates */
  215. };
  216. typedef struct L_Rdid L_RDID;
  /*-------------------------------------------------------------------------*
   *                      Flags for selecting processing                     *
   *-------------------------------------------------------------------------*/
  enum {
      L_SELECT_UNSCALED = 0,       /* select the unscaled bitmaps            */
      L_SELECT_SCALED = 1,         /* select the scaled bitmaps              */
      L_SELECT_BOTH = 2            /* select both unscaled and scaled        */
  };
  /*-------------------------------------------------------------------------*
   *                Flags for determining what to test against               *
   *-------------------------------------------------------------------------*/
  enum {
      L_USE_AVERAGE = 0,       /* form template from class average           */
      L_USE_ALL = 1            /* match against all elements of each class   */
  };
  /*-------------------------------------------------------------------------*
   *               Flags for describing limited character sets               *
   *-------------------------------------------------------------------------*/
  enum {
      L_UNKNOWN = 0,            /* character set type is not specified       */
      L_ARABIC_NUMERALS = 1,    /* 10 digits                                 */
      L_LC_ROMAN_NUMERALS = 2,  /* 7 lower-case letters (i,v,x,l,c,d,m)      */
      L_UC_ROMAN_NUMERALS = 3,  /* 7 upper-case letters (I,V,X,L,C,D,M)      */
      L_LC_ALPHA = 4,           /* 26 lower-case letters                     */
      L_UC_ALPHA = 5            /* 26 upper-case letters                     */
  };
  #endif  /* LEPTONICA_RECOG_H */