baseapi.h 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: baseapi.h
  3. // Description: Simple API for calling tesseract.
  4. // Author: Ray Smith
  5. //
  6. // (C) Copyright 2006, Google Inc.
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. // http://www.apache.org/licenses/LICENSE-2.0
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS,
  13. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. // See the License for the specific language governing permissions and
  15. // limitations under the License.
  16. //
  17. ///////////////////////////////////////////////////////////////////////
  18. #ifndef TESSERACT_API_BASEAPI_H_
  19. #define TESSERACT_API_BASEAPI_H_
  20. #include <cstdio>
  21. // To avoid collision with other typenames include the ABSOLUTE MINIMUM
  22. // complexity of includes here. Use forward declarations wherever possible
  23. // and hide includes of complex types in baseapi.cpp.
  24. #include "apitypes.h"
  25. #include "pageiterator.h"
  26. #include "platform.h"
  27. #include "publictypes.h"
  28. #include "resultiterator.h"
  29. #include "serialis.h"
  30. #include "tess_version.h"
  31. #include "tesscallback.h"
  32. #include "thresholder.h"
  33. #include "unichar.h"
  34. template <typename T> class GenericVector;
  35. class PAGE_RES;
  36. class PAGE_RES_IT;
  37. class ParagraphModel;
  38. struct BlamerBundle;
  39. class BLOCK_LIST;
  40. class DENORM;
  41. class MATRIX;
  42. class ROW;
  43. class STRING;
  44. class WERD;
  45. struct Pix;
  46. struct Box;
  47. struct Pixa;
  48. struct Boxa;
  49. class ETEXT_DESC;
  50. struct OSResults;
  51. class TBOX;
  52. class UNICHARSET;
  53. class WERD_CHOICE_LIST;
  54. struct INT_FEATURE_STRUCT;
  55. using INT_FEATURE = INT_FEATURE_STRUCT *;
  56. struct TBLOB;
  57. namespace tesseract {
  58. class Dawg;
  59. class Dict;
  60. class EquationDetect;
  61. class PageIterator;
  62. class LTRResultIterator;
  63. class ResultIterator;
  64. class MutableIterator;
  65. class TessResultRenderer;
  66. class Tesseract;
  67. class Trie;
  68. class Wordrec;
  69. using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const;
  70. using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, int, const char *, int);
  71. using ParamsModelClassifyFunc = float (Dict::*)(const char *, void *);
  72. using FillLatticeFunc = void (Wordrec::*)(const MATRIX &, const WERD_CHOICE_LIST &, const UNICHARSET &, BlamerBundle *);
  73. typedef TessCallback4<const UNICHARSET &, int, PageIterator *, Pix *>
  74. TruthCallback;
  75. /**
  76. * Base class for all tesseract APIs.
  77. * Specific classes can add ability to work on different inputs or produce
  78. * different outputs.
  79. * This class is mostly an interface layer on top of the Tesseract instance
  80. * class to hide the data types so that users of this class don't have to
  81. * include any other Tesseract headers.
  82. */
  83. class TESS_API TessBaseAPI {
  84. public:
  85. TessBaseAPI();
  86. virtual ~TessBaseAPI();
  87. /**
  88. * Returns the version identifier as a static string. Do not delete.
  89. */
  90. static const char* Version();
  91. /**
  92. * If compiled with OpenCL AND an available OpenCL
  93. * device is deemed faster than serial code, then
  94. * "device" is populated with the cl_device_id
  95. * and returns sizeof(cl_device_id)
  96. * otherwise *device=nullptr and returns 0.
  97. */
  98. static size_t getOpenCLDevice(void **device);
  99. /**
  100. * Writes the thresholded image to stderr as a PBM file on receipt of a
  101. * SIGSEGV, SIGFPE, or SIGBUS signal. (Linux/Unix only).
  102. */
  103. static void CatchSignals();
  104. /**
  105. * Set the name of the input file. Needed for training and
  106. * reading a UNLV zone file, and for searchable PDF output.
  107. */
  108. void SetInputName(const char* name);
  109. /**
  110. * These functions are required for searchable PDF output.
  111. * We need our hands on the input file so that we can include
  112. * it in the PDF without transcoding. If that is not possible,
  113. * we need the original image. Finally, resolution metadata
  114. * is stored in the PDF so we need that as well.
  115. */
  116. const char* GetInputName();
  117. // Takes ownership of the input pix.
  118. void SetInputImage(Pix *pix);
  119. Pix* GetInputImage();
  120. int GetSourceYResolution();
  121. const char* GetDatapath();
  122. /** Set the name of the bonus output files. Needed only for debugging. */
  123. void SetOutputName(const char* name);
  124. /**
  125. * Set the value of an internal "parameter."
  126. * Supply the name of the parameter and the value as a string, just as
  127. * you would in a config file.
  128. * Returns false if the name lookup failed.
  129. * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
  130. * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
  131. * SetVariable may be used before Init, but settings will revert to
  132. * defaults on End().
  133. *
  134. * Note: Must be called after Init(). Only works for non-init variables
  135. * (init variables should be passed to Init()).
  136. */
  137. bool SetVariable(const char* name, const char* value);
  138. bool SetDebugVariable(const char* name, const char* value);
  139. /**
  140. * Returns true if the parameter was found among Tesseract parameters.
  141. * Fills in value with the value of the parameter.
  142. */
  143. bool GetIntVariable(const char *name, int *value) const;
  144. bool GetBoolVariable(const char *name, bool *value) const;
  145. bool GetDoubleVariable(const char *name, double *value) const;
  146. /**
  147. * Returns the pointer to the string that represents the value of the
  148. * parameter if it was found among Tesseract parameters.
  149. */
  150. const char *GetStringVariable(const char *name) const;
  151. /**
  152. * Print Tesseract parameters to the given file.
  153. */
  154. void PrintVariables(FILE *fp) const;
  155. /**
  156. * Get value of named variable as a string, if it exists.
  157. */
  158. bool GetVariableAsString(const char *name, STRING *val);
  159. /**
  160. * Instances are now mostly thread-safe and totally independent,
  161. * but some global parameters remain. Basically it is safe to use multiple
  162. * TessBaseAPIs in different threads in parallel, UNLESS:
  163. * you use SetVariable on some of the Params in classify and textord.
  164. * If you do, then the effect will be to change it for all your instances.
  165. *
  166. * Start tesseract. Returns zero on success and -1 on failure.
  167. * NOTE that the only members that may be called before Init are those
  168. * listed above here in the class definition.
  169. *
  170. * The datapath must be the name of the tessdata directory.
  171. * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
  172. * It is entirely safe (and eventually will be efficient too) to call
  173. * Init multiple times on the same instance to change language, or just
  174. * to reset the classifier.
  175. * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
  176. * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
  177. * English. Languages may specify internally that they want to be loaded
  178. * with one or more other languages, so the ~ sign is available to override
  179. * that. Eg if hin were set to load eng by default, then hin+~eng would force
  180. * loading only hin. The number of loaded languages is limited only by
  181. * memory, with the caveat that loading additional languages will impact
  182. * both speed and accuracy, as there is more work to do to decide on the
  183. * applicable language, and there is more chance of hallucinating incorrect
  184. * words.
  185. * WARNING: On changing languages, all Tesseract parameters are reset
  186. * back to their default values. (Which may vary between languages.)
  187. * If you have a rare need to set a Variable that controls
  188. * initialization for a second call to Init you should explicitly
  189. * call End() and then use SetVariable before Init. This is only a very
  190. * rare use case, since there are very few uses that require any parameters
  191. * to be set before Init.
  192. *
  193. * If set_only_non_debug_params is true, only params that do not contain
  194. * "debug" in the name will be set.
  195. */
  196. int Init(const char* datapath, const char* language, OcrEngineMode mode,
  197. char **configs, int configs_size,
  198. const GenericVector<STRING> *vars_vec,
  199. const GenericVector<STRING> *vars_values,
  200. bool set_only_non_debug_params);
  201. int Init(const char* datapath, const char* language, OcrEngineMode oem) {
  202. return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
  203. }
  204. int Init(const char* datapath, const char* language) {
  205. return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, false);
  206. }
  207. // In-memory version reads the traineddata file directly from the given
  208. // data[data_size] array, and/or reads data via a FileReader.
  209. int Init(const char* data, int data_size, const char* language,
  210. OcrEngineMode mode, char** configs, int configs_size,
  211. const GenericVector<STRING>* vars_vec,
  212. const GenericVector<STRING>* vars_values,
  213. bool set_only_non_debug_params, FileReader reader);
  214. /**
  215. * Returns the languages string used in the last valid initialization.
  216. * If the last initialization specified "deu+hin" then that will be
  217. * returned. If hin loaded eng automatically as well, then that will
  218. * not be included in this list. To find the languages actually
  219. * loaded use GetLoadedLanguagesAsVector.
  220. * The returned string should NOT be deleted.
  221. */
  222. const char* GetInitLanguagesAsString() const;
  223. /**
  224. * Returns the loaded languages in the vector of STRINGs.
  225. * Includes all languages loaded by the last Init, including those loaded
  226. * as dependencies of other loaded languages.
  227. */
  228. void GetLoadedLanguagesAsVector(GenericVector<STRING>* langs) const;
  229. /**
  230. * Returns the available languages in the sorted vector of STRINGs.
  231. */
  232. void GetAvailableLanguagesAsVector(GenericVector<STRING>* langs) const;
  233. /**
  234. * Init only the lang model component of Tesseract. The only functions
  235. * that work after this init are SetVariable and IsValidWord.
  236. * WARNING: temporary! This function will be removed from here and placed
  237. * in a separate API at some future time.
  238. */
  239. int InitLangMod(const char* datapath, const char* language);
  240. /**
  241. * Init only for page layout analysis. Use only for calls to SetImage and
  242. * AnalysePage. Calls that attempt recognition will generate an error.
  243. */
  244. void InitForAnalysePage();
  245. /**
  246. * Read a "config" file containing a set of param, value pairs.
  247. * Searches the standard places: tessdata/configs, tessdata/tessconfigs
  248. * and also accepts a relative or absolute path name.
  249. * Note: only non-init params will be set (init params are set by Init()).
  250. */
  251. void ReadConfigFile(const char* filename);
  252. /** Same as above, but only set debug params from the given config file. */
  253. void ReadDebugConfigFile(const char* filename);
  254. /**
  255. * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
  256. * The mode is stored as an IntParam so it can also be modified by
  257. * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
  258. */
  259. void SetPageSegMode(PageSegMode mode);
  260. /** Return the current page segmentation mode. */
  261. PageSegMode GetPageSegMode() const;
  262. /**
  263. * Recognize a rectangle from an image and return the result as a string.
  264. * May be called many times for a single Init.
  265. * Currently has no error checking.
  266. * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
  267. * Palette color images will not work properly and must be converted to
  268. * 24 bit.
  269. * Binary images of 1 bit per pixel may also be given but they must be
  270. * byte packed with the MSB of the first byte being the first pixel, and a
  271. * 1 represents WHITE. For binary images set bytes_per_pixel=0.
  272. * The recognized text is returned as a char* which is coded
  273. * as UTF8 and must be freed with the delete [] operator.
  274. *
  275. * Note that TesseractRect is the simplified convenience interface.
  276. * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
  277. * and one or more of the Get*Text functions below.
  278. */
  279. char* TesseractRect(const unsigned char* imagedata,
  280. int bytes_per_pixel, int bytes_per_line,
  281. int left, int top, int width, int height);
  282. /**
  283. * Call between pages or documents etc to free up memory and forget
  284. * adaptive data.
  285. */
  286. void ClearAdaptiveClassifier();
  287. /**
  288. * @defgroup AdvancedAPI Advanced API
  289. * The following methods break TesseractRect into pieces, so you can
  290. * get hold of the thresholded image, get the text in different formats,
  291. * get bounding boxes, confidences etc.
  292. */
  293. /* @{ */
  294. /**
  295. * Provide an image for Tesseract to recognize. Format is as
  296. * TesseractRect above. Copies the image buffer and converts to Pix.
  297. * SetImage clears all recognition results, and sets the rectangle to the
  298. * full image, so it may be followed immediately by a GetUTF8Text, and it
  299. * will automatically perform recognition.
  300. */
  301. void SetImage(const unsigned char* imagedata, int width, int height,
  302. int bytes_per_pixel, int bytes_per_line);
  303. /**
  304. * Provide an image for Tesseract to recognize. As with SetImage above,
  305. * Tesseract takes its own copy of the image, so it need not persist until
  306. * after Recognize.
  307. * Pix vs raw, which to use?
  308. * Use Pix where possible. Tesseract uses Pix as its internal representation
  309. * and it is therefore more efficient to provide a Pix directly.
  310. */
  311. void SetImage(Pix* pix);
  312. /**
  313. * Set the resolution of the source image in pixels per inch so font size
  314. * information can be calculated in results. Call this after SetImage().
  315. */
  316. void SetSourceResolution(int ppi);
  317. /**
  318. * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
  319. * Each SetRectangle clears the recognition results so multiple rectangles
  320. * can be recognized with the same image.
  321. */
  322. void SetRectangle(int left, int top, int width, int height);
  323. /**
  324. * In extreme cases only, usually with a subclass of Thresholder, it
  325. * is possible to provide a different Thresholder. The Thresholder may
  326. * be preloaded with an image, settings etc, or they may be set after.
  327. * Note that Tesseract takes ownership of the Thresholder and will
  328. * delete it when it it is replaced or the API is destructed.
  329. */
  330. void SetThresholder(ImageThresholder* thresholder) {
  331. delete thresholder_;
  332. thresholder_ = thresholder;
  333. ClearResults();
  334. }
  335. /**
  336. * Get a copy of the internal thresholded image from Tesseract.
  337. * Caller takes ownership of the Pix and must pixDestroy it.
  338. * May be called any time after SetImage, or after TesseractRect.
  339. */
  340. Pix* GetThresholdedImage();
  341. /**
  342. * Get the result of page layout analysis as a leptonica-style
  343. * Boxa, Pixa pair, in reading order.
  344. * Can be called before or after Recognize.
  345. */
  346. Boxa* GetRegions(Pixa** pixa);
  347. /**
  348. * Get the textlines as a leptonica-style
  349. * Boxa, Pixa pair, in reading order.
  350. * Can be called before or after Recognize.
  351. * If raw_image is true, then extract from the original image instead of the
  352. * thresholded image and pad by raw_padding pixels.
  353. * If blockids is not nullptr, the block-id of each line is also returned as an
  354. * array of one element per line. delete [] after use.
  355. * If paraids is not nullptr, the paragraph-id of each line within its block is
  356. * also returned as an array of one element per line. delete [] after use.
  357. */
  358. Boxa* GetTextlines(bool raw_image, int raw_padding,
  359. Pixa** pixa, int** blockids, int** paraids);
  360. /*
  361. Helper method to extract from the thresholded image. (most common usage)
  362. */
  363. Boxa* GetTextlines(Pixa** pixa, int** blockids) {
  364. return GetTextlines(false, 0, pixa, blockids, nullptr);
  365. }
  366. /**
  367. * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
  368. * pair, in reading order. Enables downstream handling of non-rectangular
  369. * regions.
  370. * Can be called before or after Recognize.
  371. * If blockids is not nullptr, the block-id of each line is also returned as an
  372. * array of one element per line. delete [] after use.
  373. */
  374. Boxa* GetStrips(Pixa** pixa, int** blockids);
  375. /**
  376. * Get the words as a leptonica-style
  377. * Boxa, Pixa pair, in reading order.
  378. * Can be called before or after Recognize.
  379. */
  380. Boxa* GetWords(Pixa** pixa);
  381. /**
  382. * Gets the individual connected (text) components (created
  383. * after the page segmentation step, but before recognition)
  384. * as a leptonica-style Boxa, Pixa pair, in reading order.
  385. * Can be called before or after Recognize.
  386. * Note: the caller is responsible for calling boxaDestroy()
  387. * on the returned Boxa array and pixaDestroy() on cc array.
  388. */
  389. Boxa* GetConnectedComponents(Pixa** cc);
  390. /**
  391. * Get the given level kind of components (block, textline, word etc.) as a
  392. * leptonica-style Boxa, Pixa pair, in reading order.
  393. * Can be called before or after Recognize.
  394. * If blockids is not nullptr, the block-id of each component is also returned
  395. * as an array of one element per component. delete [] after use.
  396. * If paraids is not nullptr, the paragraph-id of each component with its block
  397. * is also returned as an array of one element per component. delete [] after
  398. * use.
  399. * If raw_image is true, then portions of the original image are extracted
  400. * instead of the thresholded image and padded with raw_padding.
  401. * If text_only is true, then only text components are returned.
  402. */
  403. Boxa* GetComponentImages(PageIteratorLevel level,
  404. bool text_only, bool raw_image,
  405. int raw_padding,
  406. Pixa** pixa, int** blockids, int** paraids);
  407. // Helper function to get binary images with no padding (most common usage).
  408. Boxa* GetComponentImages(const PageIteratorLevel level,
  409. const bool text_only,
  410. Pixa** pixa, int** blockids) {
  411. return GetComponentImages(level, text_only, false, 0, pixa, blockids, nullptr);
  412. }
  413. /**
  414. * Returns the scale factor of the thresholded image that would be returned by
  415. * GetThresholdedImage() and the various GetX() methods that call
  416. * GetComponentImages().
  417. * Returns 0 if no thresholder has been set.
  418. */
  419. int GetThresholdedImageScaleFactor() const;
  420. /**
  421. * Runs page layout analysis in the mode set by SetPageSegMode.
  422. * May optionally be called prior to Recognize to get access to just
  423. * the page layout results. Returns an iterator to the results.
  424. * If merge_similar_words is true, words are combined where suitable for use
  425. * with a line recognizer. Use if you want to use AnalyseLayout to find the
  426. * textlines, and then want to process textline fragments with an external
  427. * line recognizer.
  428. * Returns nullptr on error or an empty page.
  429. * The returned iterator must be deleted after use.
  430. * WARNING! This class points to data held within the TessBaseAPI class, and
  431. * therefore can only be used while the TessBaseAPI class still exists and
  432. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
  433. * DetectOS, or anything else that changes the internal PAGE_RES.
  434. */
  435. PageIterator* AnalyseLayout();
  436. PageIterator* AnalyseLayout(bool merge_similar_words);
  437. /**
  438. * Recognize the image from SetAndThresholdImage, generating Tesseract
  439. * internal structures. Returns 0 on success.
  440. * Optional. The Get*Text functions below will call Recognize if needed.
  441. * After Recognize, the output is kept internally until the next SetImage.
  442. */
  443. int Recognize(ETEXT_DESC* monitor);
  444. /**
  445. * Methods to retrieve information after SetAndThresholdImage(),
  446. * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
  447. */
  448. #ifndef DISABLED_LEGACY_ENGINE
  449. /** Variant on Recognize used for testing chopper. */
  450. int RecognizeForChopTest(ETEXT_DESC* monitor);
  451. #endif
  452. /**
  453. * Turns images into symbolic text.
  454. *
  455. * filename can point to a single image, a multi-page TIFF,
  456. * or a plain text list of image filenames.
  457. *
  458. * retry_config is useful for debugging. If not nullptr, you can fall
  459. * back to an alternate configuration if a page fails for some
  460. * reason.
  461. *
  462. * timeout_millisec terminates processing if any single page
  463. * takes too long. Set to 0 for unlimited time.
  464. *
  465. * renderer is responsible for creating the output. For example,
  466. * use the TessTextRenderer if you want plaintext output, or
  467. * the TessPDFRenderer to produce searchable PDF.
  468. *
  469. * If tessedit_page_number is non-negative, will only process that
  470. * single page. Works for multi-page tiff file, or filelist.
  471. *
  472. * Returns true if successful, false on error.
  473. */
  474. bool ProcessPages(const char* filename, const char* retry_config,
  475. int timeout_millisec, TessResultRenderer* renderer);
  476. // Does the real work of ProcessPages.
  477. bool ProcessPagesInternal(const char* filename, const char* retry_config,
  478. int timeout_millisec, TessResultRenderer* renderer);
  479. /**
  480. * Turn a single image into symbolic text.
  481. *
  482. * The pix is the image processed. filename and page_index are
  483. * metadata used by side-effect processes, such as reading a box
  484. * file or formatting as hOCR.
  485. *
  486. * See ProcessPages for descriptions of other parameters.
  487. */
  488. bool ProcessPage(Pix* pix, int page_index, const char* filename,
  489. const char* retry_config, int timeout_millisec,
  490. TessResultRenderer* renderer);
  491. /**
  492. * Get a reading-order iterator to the results of LayoutAnalysis and/or
  493. * Recognize. The returned iterator must be deleted after use.
  494. * WARNING! This class points to data held within the TessBaseAPI class, and
  495. * therefore can only be used while the TessBaseAPI class still exists and
  496. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
  497. * DetectOS, or anything else that changes the internal PAGE_RES.
  498. */
  499. ResultIterator* GetIterator();
  500. /**
  501. * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
  502. * The returned iterator must be deleted after use.
  503. * WARNING! This class points to data held within the TessBaseAPI class, and
  504. * therefore can only be used while the TessBaseAPI class still exists and
  505. * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
  506. * DetectOS, or anything else that changes the internal PAGE_RES.
  507. */
  508. MutableIterator* GetMutableIterator();
  509. /**
  510. * The recognized text is returned as a char* which is coded
  511. * as UTF8 and must be freed with the delete [] operator.
  512. */
  513. char* GetUTF8Text();
  514. /**
  515. * Make a HTML-formatted string with hOCR markup from the internal
  516. * data structures.
  517. * page_number is 0-based but will appear in the output as 1-based.
  518. * monitor can be used to
  519. * cancel the recognition
  520. * receive progress callbacks
  521. * Returned string must be freed with the delete [] operator.
  522. */
  523. char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
  524. /**
  525. * Make a HTML-formatted string with hOCR markup from the internal
  526. * data structures.
  527. * page_number is 0-based but will appear in the output as 1-based.
  528. * Returned string must be freed with the delete [] operator.
  529. */
  530. char* GetHOCRText(int page_number);
  531. /**
  532. * Make an XML-formatted string with Alto markup from the internal
  533. * data structures.
  534. */
  535. char* GetAltoText(ETEXT_DESC* monitor, int page_number);
  536. /**
  537. * Make an XML-formatted string with Alto markup from the internal
  538. * data structures.
  539. */
  540. char* GetAltoText(int page_number);
  541. /**
  542. * Make a TSV-formatted string from the internal data structures.
  543. * page_number is 0-based but will appear in the output as 1-based.
  544. * Returned string must be freed with the delete [] operator.
  545. */
  546. char* GetTSVText(int page_number);
  547. /**
  548. * Make a box file for LSTM training from the internal data structures.
  549. * Constructs coordinates in the original image - not just the rectangle.
  550. * page_number is a 0-based page index that will appear in the box file.
  551. * Returned string must be freed with the delete [] operator.
  552. */
  553. char* GetLSTMBoxText(int page_number);
  554. /**
  555. * The recognized text is returned as a char* which is coded in the same
  556. * format as a box file used in training.
  557. * Constructs coordinates in the original image - not just the rectangle.
  558. * page_number is a 0-based page index that will appear in the box file.
  559. * Returned string must be freed with the delete [] operator.
  560. */
  561. char* GetBoxText(int page_number);
  562. /**
  563. * The recognized text is returned as a char* which is coded in the same
  564. * format as a WordStr box file used in training.
  565. * page_number is a 0-based page index that will appear in the box file.
  566. * Returned string must be freed with the delete [] operator.
  567. */
  568. char* GetWordStrBoxText(int page_number);
  569. /**
  570. * The recognized text is returned as a char* which is coded
  571. * as UNLV format Latin-1 with specific reject and suspect codes.
  572. * Returned string must be freed with the delete [] operator.
  573. */
  574. char* GetUNLVText();
  575. /**
  576. * Detect the orientation of the input image and apparent script (alphabet).
  577. * orient_deg is the detected clockwise rotation of the input image in degrees
  578. * (0, 90, 180, 270)
  579. * orient_conf is the confidence (15.0 is reasonably confident)
  580. * script_name is an ASCII string, the name of the script, e.g. "Latin"
  581. * script_conf is confidence level in the script
  582. * Returns true on success and writes values to each parameter as an output
  583. */
  584. bool DetectOrientationScript(int* orient_deg, float* orient_conf,
  585. const char** script_name, float* script_conf);
  586. /**
  587. * The recognized text is returned as a char* which is coded
  588. * as UTF8 and must be freed with the delete [] operator.
  589. * page_number is a 0-based page index that will appear in the osd file.
  590. */
  591. char* GetOsdText(int page_number);
  592. /** Returns the (average) confidence value between 0 and 100. */
  593. int MeanTextConf();
  594. /**
  595. * Returns all word confidences (between 0 and 100) in an array, terminated
  596. * by -1. The calling function must delete [] after use.
  597. * The number of confidences should correspond to the number of space-
  598. * delimited words in GetUTF8Text.
  599. */
  600. int* AllWordConfidences();
  601. #ifndef DISABLED_LEGACY_ENGINE
  602. /**
  603. * Applies the given word to the adaptive classifier if possible.
  604. * Assumes that SetImage/SetRectangle have been used to set the image
  605. * to the given word. The currently set PageSegMode is preserved.
  606. * @param mode should be PSM_SINGLE_WORD or PSM_CIRCLE_WORD, as that will
  607. * be used to control layout analysis.
  608. * @param wordstr the word as SPACE-DELIMITED UTF-8 - l i k e t h i s , so
  609. * it can tell the boundaries of the graphemes.
  610. * @return false if adaptation was not possible for some reason.
  611. */
  612. bool AdaptToWordStr(PageSegMode mode, const char* wordstr);
  613. #endif // ndef DISABLED_LEGACY_ENGINE
  614. /**
  615. * Frees recognition results and any stored image data, without actually
  616. * freeing any recognition data that would be time-consuming to reload.
  617. * Afterwards, SetImage or TesseractRect must be called before doing
  618. * any Recognize or Get* operation.
  619. */
  620. void Clear();
  621. /**
  622. * Close down tesseract and free up all memory. End() is equivalent to
  623. * destructing and reconstructing your TessBaseAPI.
  624. * Once End() has been used, the only API functions that may be called are
  625. * Init and anything declared above it in the class definition.
  626. */
  627. void End();
  628. /**
  629. * Clear any library-level memory caches.
  630. * There are a variety of expensive-to-load constant data structures (mostly
  631. * language dictionaries) that are cached globally -- surviving the Init()
  632. * and End() of individual TessBaseAPI instances. This function allows these
  633. * caches to be cleared.
  634. */
  635. static void ClearPersistentCache();
  636. /**
  637. * Check whether a word is valid according to Tesseract's language model.
  638. * @param word the word to check.
  639. * @return 0 if the word is invalid, non-zero if valid.
  640. * @warning Temporary! This function will move to a separate API in future.
  641. */
  642. int IsValidWord(const char *word);
  643. /** @return true if utf8_character is defined in the UniCharset. */
  644. bool IsValidCharacter(const char *utf8_character);
  /** NOTE(review): undocumented. Presumably reports the text direction through
   * out_offset and out_slope and returns true on success -- confirm. */
  645. bool GetTextDirection(int* out_offset, float* out_slope);
  646. /** Points Dict::letter_is_okay_ at the given function f. */
  647. void SetDictFunc(DictFunc f);
  648. /**
  649. * Points Dict::probability_in_context_ at the given function f.
  650. */
  651. void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
  652. /**
  653. * Estimates the orientation and script of the image.
  654. * @return true if the image was processed successfully.
  655. */
  656. bool DetectOS(OSResults*);
  657. /**
  658. * Returns the text orientation of each block, as determined by an earlier
  659. * run of layout analysis (presumably through the two pointer parameters).
  660. */
  661. void GetBlockTextOrientations(int** block_orientation,
  662. bool** vertical_writing);
  663. #ifndef DISABLED_LEGACY_ENGINE
  664. /** Points Wordrec::fill_lattice_ at the given function f. */
  665. void SetFillLatticeFunc(FillLatticeFunc f);
  666. /** Finds lines in the image, making and returning the BLOCK_LIST. */
  667. BLOCK_LIST* FindLinesCreateBlockList();
  668. /**
  669. * Deletes a block list.
  670. * Provided so that the BLOCK_LIST pointer can stay opaque
  671. * and callers need not include the other headers.
  672. */
  673. static void DeleteBlockList(BLOCK_LIST* block_list);
  674. /** @return a ROW object created from the input row specification. */
  675. static ROW *MakeTessOCRRow(float baseline, float xheight,
  676. float descender, float ascender);
  677. /** @return a TBLOB corresponding to the entire input image pix. */
  678. static TBLOB *MakeTBLOB(Pix *pix);
  679. /**
  680. * Baseline-normalizes a TBLOB in-place, using the input row for the
  681. * normalization. NOTE(review): numeric_mode is undocumented here, and the
  682. * "denorm" output this comment used to describe is not in the signature.
  683. */
  684. static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode);
  685. /** Returns the features associated with the input image (given as blob). */
  686. void GetFeaturesForBlob(TBLOB* blob, INT_FEATURE_STRUCT* int_features,
  687. int* num_features, int* feature_outline_index);
  688. /**
  689. * Finds the row to which a box of the specified dimensions would belong.
  690. * @return the row, or nullptr if no good match is found.
  691. */
  692. static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top,
  693. int right, int bottom);
  694. /**
  695. * Runs the adaptive classifier on a blob.
  696. * It returns at most num_max_matches results.
  697. */
  698. void RunAdaptiveClassifier(TBLOB* blob,
  699. int num_max_matches,
  700. int* unichar_ids,
  701. float* ratings,
  702. int* num_matches_returned);
  703. #endif // ndef DISABLED_LEGACY_ENGINE
  704. /** @return the string form of the unichar specified by unichar_id. */
  705. const char* GetUnichar(int unichar_id);
  706. /** @return the pointer to the i-th dawg loaded into the tesseract_ object. */
  707. const Dawg *GetDawg(int i) const;
  708. /** @return the number of dawgs loaded into the tesseract_ object. */
  709. int NumDawgs() const;
  /** @return tesseract_, the underlying data object. */
  710. Tesseract* tesseract() const { return tesseract_; }
  /** @return last_oem_requested_, the last OcrEngineMode requested. */
  711. OcrEngineMode oem() const { return last_oem_requested_; }
  /** Stores cb in truth_cb_ (the function for setting truth_* in WERD_RES). */
  712. void InitTruthCallback(TruthCallback *cb) { truth_cb_ = cb; }
  /** NOTE(review): undocumented. Presumably sets the minimum orientation
   * margin to the given value -- confirm against the implementation. */
  713. void set_min_orientation_margin(double margin);
  714. /* @} */
  715. protected:
  716. /** Common code for setting the image. @return true if Init has been called. */
  717. TESS_LOCAL bool InternalSetImage();
  718. /**
  719. * Runs the thresholder to make the thresholded image. If pix is not nullptr,
  720. * the source is thresholded to pix instead of to the internal IMAGE.
  721. */
  722. TESS_LOCAL virtual bool Threshold(Pix** pix);
  723. /**
  724. * Finds lines in the image, making the BLOCK_LIST.
  725. * @return 0 on success.
  726. */
  727. TESS_LOCAL int FindLines();
  728. /** Deletes the pageres and block list, ready for a new page. */
  729. void ClearResults();
  730. /**
  731. * Returns an LTR Result Iterator -- used only for training, as we really
  732. * want to ignore all BiDi smarts at that point.
  733. * The caller must delete the iterator once done with it.
  734. */
  735. TESS_LOCAL LTRResultIterator* GetLTRIterator();
  736. /**
  737. * Returns the length of the output text string, as UTF8, assuming one
  738. * newline per line and one per block, with a terminator, and a single
  739. * character reject marker for each rejected character.
  740. * @param[out] blob_count receives the number of recognized blobs.
  741. */
  742. TESS_LOCAL int TextLength(int* blob_count);
  743. //// paragraphs.cpp ////////////////////////////////////////////////////
  /** NOTE(review): undocumented here. The banner above suggests the
   * implementation lives in paragraphs.cpp -- confirm. */
  744. TESS_LOCAL void DetectParagraphs(bool after_text_recognition);
  745. #ifndef DISABLED_LEGACY_ENGINE
  746. /** @defgroup ocropusAddOns ocropus add-ons */
  747. /* @{ */
  748. /**
  749. * Adapts to recognize the current image as the character in unichar_repr.
  750. * The image must be preloaded and must be an image of just a single character.
  751. */
  752. TESS_LOCAL void AdaptToCharacter(const char *unichar_repr,
  753. int length,
  754. float baseline,
  755. float xheight,
  756. float descender,
  757. float ascender);
  758. /** Recognizes text doing one pass only, using the settings for that pass. */
  759. TESS_LOCAL PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
  /** Recognizes text doing one pass only, using the settings for that pass.
   * NOTE(review): pass1_result is presumably the PAGE_RES returned by
   * RecognitionPass1 -- confirm. */
  760. TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
  761. PAGE_RES* pass1_result);
  762. /**
  763. * Extracts the OCR results, their costs (penalty points for uncertainty),
  764. * and the bounding boxes of the characters.
  765. */
  766. TESS_LOCAL static int TesseractExtractResult(char** text,
  767. int** lengths,
  768. float** costs,
  769. int** x0,
  770. int** y0,
  771. int** x1,
  772. int** y1,
  773. PAGE_RES* page_res);
  /** @return page_res_, the page-level data. */
  774. TESS_LOCAL const PAGE_RES* GetPageRes() const { return page_res_; }
  775. /* @} */
  776. #endif // ndef DISABLED_LEGACY_ENGINE
  777. protected:
  778. Tesseract* tesseract_; ///< The underlying data object.
  779. Tesseract* osd_tesseract_; ///< For orientation & script detection.
  780. EquationDetect* equ_detect_; ///< The equation detector.
  781. FileReader reader_; ///< Reads files from any filesystem.
  782. ImageThresholder* thresholder_; ///< Image thresholding module.
  783. GenericVector<ParagraphModel *>* paragraph_models_; ///< NOTE(review): undocumented; presumably the paragraph models in use -- confirm.
  784. BLOCK_LIST* block_list_; ///< The page layout.
  785. PAGE_RES* page_res_; ///< The page-level data.
  786. STRING* input_file_; ///< Name used by training code.
  787. STRING* output_file_; ///< Name used by debug code.
  788. STRING* datapath_; ///< Current location of tessdata.
  789. STRING* language_; ///< Last initialized language.
  790. OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested.
  791. bool recognition_done_; ///< page_res_ contains recognition data.
  792. TruthCallback *truth_cb_; ///< Function for setting truth_* in WERD_RES.
  793. /**
  794. * @defgroup ThresholderParams Thresholder Parameters
  795. * Parameters saved from the Thresholder; they are needed to rebuild coordinates.
  796. */
  797. /* @{ */
  798. int rect_left_;
  799. int rect_top_;
  800. int rect_width_;
  801. int rect_height_;
  802. int image_width_;
  803. int image_height_;
  804. /* @} */
  805. private:
  806. // Input that is a list of image filenames gets special consideration here.
  807. bool ProcessPagesFileList(FILE *fp,
  808. STRING *buf,
  809. const char* retry_config, int timeout_millisec,
  810. TessResultRenderer* renderer,
  811. int tessedit_page_number);
  812. // TIFF supports multipage, so TIFF input gets special consideration here.
  813. bool ProcessPagesMultipageTiff(const unsigned char *data,
  814. size_t size,
  815. const char* filename,
  816. const char* retry_config,
  817. int timeout_millisec,
  818. TessResultRenderer* renderer,
  819. int tessedit_page_number);
  820. // Document title used when none is known (an empty string).
  821. // There's currently no way to pass a document title from the
  822. // Tesseract command line, and multiple places choose to set the
  823. // title to an empty string. Using a single named variable will
  824. // hopefully reduce confusion if the situation changes in the future.
  825. const char *unknown_title_ = "";
  826. }; // class TessBaseAPI.
  827. /** Escape a char string - replace &<>"' with HTML codes. */
  828. STRING HOcrEscape(const char* text);
  829. } // namespace tesseract.
  830. #endif // TESSERACT_API_BASEAPI_H_