| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945 |
- ///////////////////////////////////////////////////////////////////////
- // File: baseapi.h
- // Description: Simple API for calling tesseract.
- // Author: Ray Smith
- //
- // (C) Copyright 2006, Google Inc.
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- // http://www.apache.org/licenses/LICENSE-2.0
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- //
- ///////////////////////////////////////////////////////////////////////
- #ifndef TESSERACT_API_BASEAPI_H_
- #define TESSERACT_API_BASEAPI_H_
- #include <cstdio>
- // To avoid collision with other typenames include the ABSOLUTE MINIMUM
- // complexity of includes here. Use forward declarations wherever possible
- // and hide includes of complex types in baseapi.cpp.
- #include "apitypes.h"
- #include "pageiterator.h"
- #include "platform.h"
- #include "publictypes.h"
- #include "resultiterator.h"
- #include "serialis.h"
- #include "tess_version.h"
- #include "tesscallback.h"
- #include "thresholder.h"
- #include "unichar.h"
- template <typename T> class GenericVector;
- class PAGE_RES;
- class PAGE_RES_IT;
- class ParagraphModel;
- struct BlamerBundle;
- class BLOCK_LIST;
- class DENORM;
- class MATRIX;
- class ROW;
- class STRING;
- class WERD;
- struct Pix;
- struct Box;
- struct Pixa;
- struct Boxa;
- class ETEXT_DESC;
- struct OSResults;
- class TBOX;
- class UNICHARSET;
- class WERD_CHOICE_LIST;
- struct INT_FEATURE_STRUCT;
- using INT_FEATURE = INT_FEATURE_STRUCT *;
- struct TBLOB;
- namespace tesseract {
- class Dawg;
- class Dict;
- class EquationDetect;
- class PageIterator;
- class LTRResultIterator;
- class ResultIterator;
- class MutableIterator;
- class TessResultRenderer;
- class Tesseract;
- class Trie;
- class Wordrec;
- using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const;
- using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, int, const char *, int);
- using ParamsModelClassifyFunc = float (Dict::*)(const char *, void *);
- using FillLatticeFunc = void (Wordrec::*)(const MATRIX &, const WERD_CHOICE_LIST &, const UNICHARSET &, BlamerBundle *);
- // Callback type accepted by TessBaseAPI::InitTruthCallback(): a TessCallback4
- // taking (const UNICHARSET &, int, PageIterator *, Pix *).
- // Declared with `using` to match the other aliases in this header.
- using TruthCallback = TessCallback4<const UNICHARSET &, int, PageIterator *, Pix *>;
- /**
- * Base class for all tesseract APIs.
- * Specific classes can add ability to work on different inputs or produce
- * different outputs.
- * This class is mostly an interface layer on top of the Tesseract instance
- * class to hide the data types so that users of this class don't have to
- * include any other Tesseract headers.
- */
- class TESS_API TessBaseAPI {
- public:
- TessBaseAPI();
- virtual ~TessBaseAPI();
- /**
- * Returns the version identifier as a static string. Do not delete.
- */
- static const char* Version();
- /**
- * If compiled with OpenCL AND an available OpenCL
- * device is deemed faster than serial code, then
- * "device" is populated with the cl_device_id
- * and returns sizeof(cl_device_id)
- * otherwise *device=nullptr and returns 0.
- */
- static size_t getOpenCLDevice(void **device);
- /**
- * Writes the thresholded image to stderr as a PBM file on receipt of a
- * SIGSEGV, SIGFPE, or SIGBUS signal. (Linux/Unix only).
- */
- static void CatchSignals();
- /**
- * Set the name of the input file. Needed for training and
- * reading a UNLV zone file, and for searchable PDF output.
- */
- void SetInputName(const char* name);
- /**
- * These functions are required for searchable PDF output.
- * We need our hands on the input file so that we can include
- * it in the PDF without transcoding. If that is not possible,
- * we need the original image. Finally, resolution metadata
- * is stored in the PDF so we need that as well.
- */
- const char* GetInputName();
- // Takes ownership of the input pix.
- void SetInputImage(Pix *pix);
- Pix* GetInputImage();
- int GetSourceYResolution();
- const char* GetDatapath();
- /** Set the name of the bonus output files. Needed only for debugging. */
- void SetOutputName(const char* name);
- /**
- * Set the value of an internal "parameter."
- * Supply the name of the parameter and the value as a string, just as
- * you would in a config file.
- * Returns false if the name lookup failed.
- * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
- * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
- * SetVariable may be used before Init, but settings will revert to
- * defaults on End().
- *
- * Note: Must be called after Init(). Only works for non-init variables
- * (init variables should be passed to Init()).
- */
- bool SetVariable(const char* name, const char* value);
- bool SetDebugVariable(const char* name, const char* value);
- /**
- * Returns true if the parameter was found among Tesseract parameters.
- * Fills in value with the value of the parameter.
- */
- bool GetIntVariable(const char *name, int *value) const;
- bool GetBoolVariable(const char *name, bool *value) const;
- bool GetDoubleVariable(const char *name, double *value) const;
- /**
- * Returns the pointer to the string that represents the value of the
- * parameter if it was found among Tesseract parameters.
- */
- const char *GetStringVariable(const char *name) const;
- /**
- * Print Tesseract parameters to the given file.
- */
- void PrintVariables(FILE *fp) const;
- /**
- * Get value of named variable as a string, if it exists.
- */
- bool GetVariableAsString(const char *name, STRING *val);
- /**
- * Instances are now mostly thread-safe and totally independent,
- * but some global parameters remain. Basically it is safe to use multiple
- * TessBaseAPIs in different threads in parallel, UNLESS:
- * you use SetVariable on some of the Params in classify and textord.
- * If you do, then the effect will be to change it for all your instances.
- *
- * Start tesseract. Returns zero on success and -1 on failure.
- * NOTE that the only members that may be called before Init are those
- * listed above here in the class definition.
- *
- * The datapath must be the name of the tessdata directory.
- * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
- * It is entirely safe (and eventually will be efficient too) to call
- * Init multiple times on the same instance to change language, or just
- * to reset the classifier.
- * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
- * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
- * English. Languages may specify internally that they want to be loaded
- * with one or more other languages, so the ~ sign is available to override
- * that. Eg if hin were set to load eng by default, then hin+~eng would force
- * loading only hin. The number of loaded languages is limited only by
- * memory, with the caveat that loading additional languages will impact
- * both speed and accuracy, as there is more work to do to decide on the
- * applicable language, and there is more chance of hallucinating incorrect
- * words.
- * WARNING: On changing languages, all Tesseract parameters are reset
- * back to their default values. (Which may vary between languages.)
- * If you have a rare need to set a Variable that controls
- * initialization for a second call to Init you should explicitly
- * call End() and then use SetVariable before Init. This is only a very
- * rare use case, since there are very few uses that require any parameters
- * to be set before Init.
- *
- * If set_only_non_debug_params is true, only params that do not contain
- * "debug" in the name will be set.
- */
- int Init(const char* datapath, const char* language, OcrEngineMode mode,
- char **configs, int configs_size,
- const GenericVector<STRING> *vars_vec,
- const GenericVector<STRING> *vars_values,
- bool set_only_non_debug_params);
- // Convenience overload of the full Init() above: starts tesseract with the
- // requested engine mode, passing no config files, no variable overrides and
- // set_only_non_debug_params = false.
- int Init(const char* datapath, const char* language, OcrEngineMode oem) {
-   char** const no_configs = nullptr;
-   const GenericVector<STRING>* const no_vars = nullptr;
-   return Init(datapath, language, oem, no_configs, 0, no_vars, no_vars,
-               false);
- }
- // Convenience overload of the full Init() above: uses the OEM_DEFAULT engine
- // mode, no config files, no variable overrides and
- // set_only_non_debug_params = false.
- int Init(const char* datapath, const char* language) {
-   const OcrEngineMode default_mode = OEM_DEFAULT;
-   return Init(datapath, language, default_mode, nullptr, 0, nullptr, nullptr,
-               false);
- }
- // In-memory version reads the traineddata file directly from the given
- // data[data_size] array, and/or reads data via a FileReader.
- int Init(const char* data, int data_size, const char* language,
- OcrEngineMode mode, char** configs, int configs_size,
- const GenericVector<STRING>* vars_vec,
- const GenericVector<STRING>* vars_values,
- bool set_only_non_debug_params, FileReader reader);
- /**
- * Returns the languages string used in the last valid initialization.
- * If the last initialization specified "deu+hin" then that will be
- * returned. If hin loaded eng automatically as well, then that will
- * not be included in this list. To find the languages actually
- * loaded use GetLoadedLanguagesAsVector.
- * The returned string should NOT be deleted.
- */
- const char* GetInitLanguagesAsString() const;
- /**
- * Returns the loaded languages in the vector of STRINGs.
- * Includes all languages loaded by the last Init, including those loaded
- * as dependencies of other loaded languages.
- */
- void GetLoadedLanguagesAsVector(GenericVector<STRING>* langs) const;
- /**
- * Returns the available languages in the sorted vector of STRINGs.
- */
- void GetAvailableLanguagesAsVector(GenericVector<STRING>* langs) const;
- /**
- * Init only the lang model component of Tesseract. The only functions
- * that work after this init are SetVariable and IsValidWord.
- * WARNING: temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int InitLangMod(const char* datapath, const char* language);
- /**
- * Init only for page layout analysis. Use only for calls to SetImage and
- * AnalysePage. Calls that attempt recognition will generate an error.
- */
- void InitForAnalysePage();
- /**
- * Read a "config" file containing a set of param, value pairs.
- * Searches the standard places: tessdata/configs, tessdata/tessconfigs
- * and also accepts a relative or absolute path name.
- * Note: only non-init params will be set (init params are set by Init()).
- */
- void ReadConfigFile(const char* filename);
- /** Same as above, but only set debug params from the given config file. */
- void ReadDebugConfigFile(const char* filename);
- /**
- * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
- * The mode is stored as an IntParam so it can also be modified by
- * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
- */
- void SetPageSegMode(PageSegMode mode);
- /** Return the current page segmentation mode. */
- PageSegMode GetPageSegMode() const;
- /**
- * Recognize a rectangle from an image and return the result as a string.
- * May be called many times for a single Init.
- * Currently has no error checking.
- * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
- * Palette color images will not work properly and must be converted to
- * 24 bit.
- * Binary images of 1 bit per pixel may also be given but they must be
- * byte packed with the MSB of the first byte being the first pixel, and a
- * 1 represents WHITE. For binary images set bytes_per_pixel=0.
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- *
- * Note that TesseractRect is the simplified convenience interface.
- * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
- * and one or more of the Get*Text functions below.
- */
- char* TesseractRect(const unsigned char* imagedata,
- int bytes_per_pixel, int bytes_per_line,
- int left, int top, int width, int height);
- /**
- * Call between pages or documents etc to free up memory and forget
- * adaptive data.
- */
- void ClearAdaptiveClassifier();
- /**
- * @defgroup AdvancedAPI Advanced API
- * The following methods break TesseractRect into pieces, so you can
- * get hold of the thresholded image, get the text in different formats,
- * get bounding boxes, confidences etc.
- */
- /* @{ */
- /**
- * Provide an image for Tesseract to recognize. Format is as
- * TesseractRect above. Copies the image buffer and converts to Pix.
- * SetImage clears all recognition results, and sets the rectangle to the
- * full image, so it may be followed immediately by a GetUTF8Text, and it
- * will automatically perform recognition.
- */
- void SetImage(const unsigned char* imagedata, int width, int height,
- int bytes_per_pixel, int bytes_per_line);
- /**
- * Provide an image for Tesseract to recognize. As with SetImage above,
- * Tesseract takes its own copy of the image, so it need not persist until
- * after Recognize.
- * Pix vs raw, which to use?
- * Use Pix where possible. Tesseract uses Pix as its internal representation
- * and it is therefore more efficient to provide a Pix directly.
- */
- void SetImage(Pix* pix);
- /**
- * Set the resolution of the source image in pixels per inch so font size
- * information can be calculated in results. Call this after SetImage().
- */
- void SetSourceResolution(int ppi);
- /**
- * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
- * Each SetRectangle clears the recognition results so multiple rectangles
- * can be recognized with the same image.
- */
- void SetRectangle(int left, int top, int width, int height);
- /**
- * In extreme cases only, usually with a subclass of Thresholder, it
- * is possible to provide a different Thresholder. The Thresholder may
- * be preloaded with an image, settings etc, or they may be set after.
- * Note that Tesseract takes ownership of the Thresholder and will
- * delete it when it is replaced or the API is destructed.
- */
- void SetThresholder(ImageThresholder* thresholder) {
-   // Ownership of thresholder passes to this object. Skip the delete when the
-   // caller hands back the thresholder that is already installed; otherwise
-   // thresholder_ would be left pointing at freed memory.
-   if (thresholder != thresholder_) {
-     delete thresholder_;
-     thresholder_ = thresholder;
-   }
-   // Existing results are cleared whether or not the pointer changed.
-   ClearResults();
- }
- /**
- * Get a copy of the internal thresholded image from Tesseract.
- * Caller takes ownership of the Pix and must pixDestroy it.
- * May be called any time after SetImage, or after TesseractRect.
- */
- Pix* GetThresholdedImage();
- /**
- * Get the result of page layout analysis as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa* GetRegions(Pixa** pixa);
- /**
- * Get the textlines as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If raw_image is true, then extract from the original image instead of the
- * thresholded image and pad by raw_padding pixels.
- * If blockids is not nullptr, the block-id of each line is also returned as an
- * array of one element per line. delete [] after use.
- * If paraids is not nullptr, the paragraph-id of each line within its block is
- * also returned as an array of one element per line. delete [] after use.
- */
- Boxa* GetTextlines(bool raw_image, int raw_padding,
- Pixa** pixa, int** blockids, int** paraids);
- /*
- Helper method to extract from the thresholded image. (most common usage)
- */
- Boxa* GetTextlines(Pixa** pixa, int** blockids) {
-   // Extract from the thresholded image: no raw image, zero padding, and no
-   // paragraph ids requested.
-   const bool use_raw_image = false;
-   const int padding = 0;
-   int** const no_paraids = nullptr;
-   return GetTextlines(use_raw_image, padding, pixa, blockids, no_paraids);
- }
- /**
- * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
- * pair, in reading order. Enables downstream handling of non-rectangular
- * regions.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each line is also returned as an
- * array of one element per line. delete [] after use.
- */
- Boxa* GetStrips(Pixa** pixa, int** blockids);
- /**
- * Get the words as a leptonica-style
- * Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- */
- Boxa* GetWords(Pixa** pixa);
- /**
- * Gets the individual connected (text) components (created
- * after the page segmentation step, but before recognition)
- * as a leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * Note: the caller is responsible for calling boxaDestroy()
- * on the returned Boxa array and pixaDestroy() on cc array.
- */
- Boxa* GetConnectedComponents(Pixa** cc);
- /**
- * Get the given level kind of components (block, textline, word etc.) as a
- * leptonica-style Boxa, Pixa pair, in reading order.
- * Can be called before or after Recognize.
- * If blockids is not nullptr, the block-id of each component is also returned
- * as an array of one element per component. delete [] after use.
- * If paraids is not nullptr, the paragraph-id of each component within its
- * block is also returned as an array of one element per component. delete []
- * after use.
- * If raw_image is true, then portions of the original image are extracted
- * instead of the thresholded image and padded with raw_padding.
- * If text_only is true, then only text components are returned.
- */
- Boxa* GetComponentImages(PageIteratorLevel level,
- bool text_only, bool raw_image,
- int raw_padding,
- Pixa** pixa, int** blockids, int** paraids);
- // Helper function to get binary images with no padding (most common usage).
- Boxa* GetComponentImages(const PageIteratorLevel level,
-                          const bool text_only,
-                          Pixa** pixa, int** blockids) {
-   // Binary (thresholded) images with zero padding; paragraph ids are not
-   // requested.
-   const bool raw_image = false;
-   const int raw_padding = 0;
-   int** const paraids = nullptr;
-   return GetComponentImages(level, text_only, raw_image, raw_padding, pixa,
-                             blockids, paraids);
- }
- /**
- * Returns the scale factor of the thresholded image that would be returned by
- * GetThresholdedImage() and the various GetX() methods that call
- * GetComponentImages().
- * Returns 0 if no thresholder has been set.
- */
- int GetThresholdedImageScaleFactor() const;
- /**
- * Runs page layout analysis in the mode set by SetPageSegMode.
- * May optionally be called prior to Recognize to get access to just
- * the page layout results. Returns an iterator to the results.
- * If merge_similar_words is true, words are combined where suitable for use
- * with a line recognizer. Use if you want to use AnalyseLayout to find the
- * textlines, and then want to process textline fragments with an external
- * line recognizer.
- * Returns nullptr on error or an empty page.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- PageIterator* AnalyseLayout();
- PageIterator* AnalyseLayout(bool merge_similar_words);
- /**
- * Recognize the image from SetAndThresholdImage, generating Tesseract
- * internal structures. Returns 0 on success.
- * Optional. The Get*Text functions below will call Recognize if needed.
- * After Recognize, the output is kept internally until the next SetImage.
- */
- int Recognize(ETEXT_DESC* monitor);
- /**
- * Methods to retrieve information after SetAndThresholdImage(),
- * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
- */
- #ifndef DISABLED_LEGACY_ENGINE
- /** Variant on Recognize used for testing chopper. */
- int RecognizeForChopTest(ETEXT_DESC* monitor);
- #endif
- /**
- * Turns images into symbolic text.
- *
- * filename can point to a single image, a multi-page TIFF,
- * or a plain text list of image filenames.
- *
- * retry_config is useful for debugging. If not nullptr, you can fall
- * back to an alternate configuration if a page fails for some
- * reason.
- *
- * timeout_millisec terminates processing if any single page
- * takes too long. Set to 0 for unlimited time.
- *
- * renderer is responsible for creating the output. For example,
- * use the TessTextRenderer if you want plaintext output, or
- * the TessPDFRenderer to produce searchable PDF.
- *
- * If tessedit_page_number is non-negative, will only process that
- * single page. Works for multi-page tiff file, or filelist.
- *
- * Returns true if successful, false on error.
- */
- bool ProcessPages(const char* filename, const char* retry_config,
- int timeout_millisec, TessResultRenderer* renderer);
- // Does the real work of ProcessPages.
- bool ProcessPagesInternal(const char* filename, const char* retry_config,
- int timeout_millisec, TessResultRenderer* renderer);
- /**
- * Turn a single image into symbolic text.
- *
- * The pix is the image processed. filename and page_index are
- * metadata used by side-effect processes, such as reading a box
- * file or formatting as hOCR.
- *
- * See ProcessPages for descriptions of other parameters.
- */
- bool ProcessPage(Pix* pix, int page_index, const char* filename,
- const char* retry_config, int timeout_millisec,
- TessResultRenderer* renderer);
- /**
- * Get a reading-order iterator to the results of LayoutAnalysis and/or
- * Recognize. The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- ResultIterator* GetIterator();
- /**
- * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
- * The returned iterator must be deleted after use.
- * WARNING! This class points to data held within the TessBaseAPI class, and
- * therefore can only be used while the TessBaseAPI class still exists and
- * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
- * DetectOS, or anything else that changes the internal PAGE_RES.
- */
- MutableIterator* GetMutableIterator();
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- */
- char* GetUTF8Text();
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * monitor can be used to
- * cancel the recognition
- * receive progress callbacks
- * Returned string must be freed with the delete [] operator.
- */
- char* GetHOCRText(ETEXT_DESC* monitor, int page_number);
- /**
- * Make a HTML-formatted string with hOCR markup from the internal
- * data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetHOCRText(int page_number);
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char* GetAltoText(ETEXT_DESC* monitor, int page_number);
- /**
- * Make an XML-formatted string with Alto markup from the internal
- * data structures.
- */
- char* GetAltoText(int page_number);
- /**
- * Make a TSV-formatted string from the internal data structures.
- * page_number is 0-based but will appear in the output as 1-based.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetTSVText(int page_number);
- /**
- * Make a box file for LSTM training from the internal data structures.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetLSTMBoxText(int page_number);
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a box file used in training.
- * Constructs coordinates in the original image - not just the rectangle.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetBoxText(int page_number);
- /**
- * The recognized text is returned as a char* which is coded in the same
- * format as a WordStr box file used in training.
- * page_number is a 0-based page index that will appear in the box file.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetWordStrBoxText(int page_number);
- /**
- * The recognized text is returned as a char* which is coded
- * as UNLV format Latin-1 with specific reject and suspect codes.
- * Returned string must be freed with the delete [] operator.
- */
- char* GetUNLVText();
- /**
- * Detect the orientation of the input image and apparent script (alphabet).
- * orient_deg is the detected clockwise rotation of the input image in degrees
- * (0, 90, 180, 270)
- * orient_conf is the confidence (15.0 is reasonably confident)
- * script_name is an ASCII string, the name of the script, e.g. "Latin"
- * script_conf is confidence level in the script
- * Returns true on success and writes values to each parameter as an output
- */
- bool DetectOrientationScript(int* orient_deg, float* orient_conf,
- const char** script_name, float* script_conf);
- /**
- * The recognized text is returned as a char* which is coded
- * as UTF8 and must be freed with the delete [] operator.
- * page_number is a 0-based page index that will appear in the osd file.
- */
- char* GetOsdText(int page_number);
- /** Returns the (average) confidence value between 0 and 100. */
- int MeanTextConf();
- /**
- * Returns all word confidences (between 0 and 100) in an array, terminated
- * by -1. The calling function must delete [] after use.
- * The number of confidences should correspond to the number of space-
- * delimited words in GetUTF8Text.
- */
- int* AllWordConfidences();
- #ifndef DISABLED_LEGACY_ENGINE
- /**
- * Applies the given word to the adaptive classifier if possible.
- * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
- * tell the boundaries of the graphemes.
- * Assumes that SetImage/SetRectangle have been used to set the image
- * to the given word. The mode arg should be PSM_SINGLE_WORD or
- * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
- * The currently set PageSegMode is preserved.
- * Returns false if adaption was not possible for some reason.
- */
- bool AdaptToWordStr(PageSegMode mode, const char* wordstr);
- #endif // ndef DISABLED_LEGACY_ENGINE
- /**
- * Free up recognition results and any stored image data, without actually
- * freeing any recognition data that would be time-consuming to reload.
- * Afterwards, you must call SetImage or TesseractRect before doing
- * any Recognize or Get* operation.
- */
- void Clear();
- /**
- * Close down tesseract and free up all memory. End() is equivalent to
- * destructing and reconstructing your TessBaseAPI.
- * Once End() has been used, none of the other API functions may be used
- * other than Init and anything declared above it in the class definition.
- */
- void End();
- /**
- * Clear any library-level memory caches.
- * There are a variety of expensive-to-load constant data structures (mostly
- * language dictionaries) that are cached globally -- surviving the Init()
- * and End() of individual TessBaseAPI's. This function allows the clearing
- * of these caches.
- **/
- static void ClearPersistentCache();
- /**
- * Check whether a word is valid according to Tesseract's language model
- * @return 0 if the word is invalid, non-zero if valid.
- * @warning temporary! This function will be removed from here and placed
- * in a separate API at some future time.
- */
- int IsValidWord(const char *word);
- // Returns true if utf8_character is defined in the UniCharset.
- bool IsValidCharacter(const char *utf8_character);
- bool GetTextDirection(int* out_offset, float* out_slope);
- /** Sets Dict::letter_is_okay_ function to point to the given function. */
- void SetDictFunc(DictFunc f);
- /** Sets Dict::probability_in_context_ function to point to the given
- * function.
- */
- void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
- /**
- * Estimates the Orientation And Script of the image.
- * @return true if the image was processed successfully.
- */
- bool DetectOS(OSResults*);
- /**
- * Return text orientation of each block as determined by an earlier run
- * of layout analysis.
- */
- void GetBlockTextOrientations(int** block_orientation,
- bool** vertical_writing);
- #ifndef DISABLED_LEGACY_ENGINE
- /** Sets Wordrec::fill_lattice_ function to point to the given function. */
- void SetFillLatticeFunc(FillLatticeFunc f);
- /** Find lines from the image making the BLOCK_LIST. */
- BLOCK_LIST* FindLinesCreateBlockList();
- /**
- * Delete a block list.
- * This is to keep BLOCK_LIST pointer opaque
- * and let go of including the other headers.
- */
- static void DeleteBlockList(BLOCK_LIST* block_list);
- /** Returns a ROW object created from the input row specification. */
- static ROW *MakeTessOCRRow(float baseline, float xheight,
- float descender, float ascender);
- /** Returns a TBLOB corresponding to the entire input image. */
- static TBLOB *MakeTBLOB(Pix *pix);
- /**
- * This method baseline normalizes a TBLOB in-place. The input row is used
- * for normalization.
- * NOTE(review): this comment previously described an optional denorm
- * output parameter, but the signature below has no such parameter.
- * The effect of numeric_mode is not described here -- confirm against the
- * implementation.
- */
- static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode);
- /** This method returns the features associated with the input image. */
- void GetFeaturesForBlob(TBLOB* blob, INT_FEATURE_STRUCT* int_features,
- int* num_features, int* feature_outline_index);
- /**
- * This method returns the row to which a box of specified dimensions would
- * belong. If no good match is found, it returns nullptr.
- */
- static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top,
- int right, int bottom);
- /**
- * Method to run adaptive classifier on a blob.
- * It returns at most num_max_matches results.
- */
- void RunAdaptiveClassifier(TBLOB* blob,
- int num_max_matches,
- int* unichar_ids,
- float* ratings,
- int* num_matches_returned);
- #endif // ndef DISABLED_LEGACY_ENGINE
- /** This method returns the string form of the specified unichar. */
- const char* GetUnichar(int unichar_id);
- /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
- const Dawg *GetDawg(int i) const;
- /** Return the number of dawgs loaded into tesseract_ object. */
- int NumDawgs() const;
- Tesseract* tesseract() const { return tesseract_; } ///< Returns the tesseract_ pointer as-is.
- OcrEngineMode oem() const { return last_oem_requested_; } ///< Returns last_oem_requested_.
- void InitTruthCallback(TruthCallback *cb) { truth_cb_ = cb; } ///< Stores cb in truth_cb_.
- void set_min_orientation_margin(double margin); ///< NOTE(review): undocumented here; presumably sets a minimum orientation margin -- confirm against the implementation.
- /* @} */
- protected:
- /** Common code for setting the image. Returns true if Init has been called. */
- TESS_LOCAL bool InternalSetImage();
- /**
- * Run the thresholder to make the thresholded image. If pix is not nullptr,
- * the source is thresholded to pix instead of the internal IMAGE.
- */
- TESS_LOCAL virtual bool Threshold(Pix** pix);
- /**
- * Find lines from the image making the BLOCK_LIST.
- * @return 0 on success.
- */
- TESS_LOCAL int FindLines();
- /** Delete the pageres and block list ready for a new page. */
- void ClearResults();
- /**
- * Return an LTR Result Iterator -- used only for training, as we really want
- * to ignore all BiDi smarts at that point.
- * The caller must delete the returned iterator once done with it.
- */
- TESS_LOCAL LTRResultIterator* GetLTRIterator();
- /**
- * Return the length of the output text string, as UTF8, assuming
- * one newline per line and one per block, with a terminator,
- * and assuming a single character reject marker for each rejected character.
- * Also return the number of recognized blobs in blob_count.
- */
- TESS_LOCAL int TextLength(int* blob_count);
- //// paragraphs.cpp ////////////////////////////////////////////////////
- TESS_LOCAL void DetectParagraphs(bool after_text_recognition); ///< NOTE(review): undocumented here; presumably defined in paragraphs.cpp -- confirm what after_text_recognition changes.
- #ifndef DISABLED_LEGACY_ENGINE
- /** @defgroup ocropusAddOns ocropus add-ons */
- /* @{ */
- /**
- * Adapt to recognize the current image as the given character.
- * The image must be preloaded and be just an image of a single character.
- * NOTE(review): length is presumably the length of unichar_repr -- confirm
- * against the implementation.
- */
- TESS_LOCAL void AdaptToCharacter(const char *unichar_repr,
- int length,
- float baseline,
- float xheight,
- float descender,
- float ascender);
- /**
- * Recognize text doing one pass only, using settings for a given pass.
- * This description is shared by RecognitionPass1 and RecognitionPass2.
- */
- TESS_LOCAL PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
- TESS_LOCAL PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
- PAGE_RES* pass1_result);
- /**
- * Extract the OCR results, costs (penalty points for uncertainty),
- * and the bounding boxes of the characters.
- */
- TESS_LOCAL static int TesseractExtractResult(char** text,
- int** lengths,
- float** costs,
- int** x0,
- int** y0,
- int** x1,
- int** y1,
- PAGE_RES* page_res);
- TESS_LOCAL const PAGE_RES* GetPageRes() const { return page_res_; } ///< Returns page_res_ as-is.
- /* @} */
- #endif // ndef DISABLED_LEGACY_ENGINE
- protected:
- Tesseract* tesseract_; ///< The underlying data object.
- Tesseract* osd_tesseract_; ///< For orientation & script detection.
- EquationDetect* equ_detect_; ///< The equation detector.
- FileReader reader_; ///< Reads files from any filesystem.
- ImageThresholder* thresholder_; ///< Image thresholding module.
- GenericVector<ParagraphModel *>* paragraph_models_; ///< NOTE(review): undocumented here; presumably the paragraph models found by paragraph detection -- confirm.
- BLOCK_LIST* block_list_; ///< The page layout.
- PAGE_RES* page_res_; ///< The page-level data.
- STRING* input_file_; ///< Name used by training code.
- STRING* output_file_; ///< Name used by debug code.
- STRING* datapath_; ///< Current location of tessdata.
- STRING* language_; ///< Last initialized language.
- OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested.
- bool recognition_done_; ///< page_res_ contains recognition data.
- TruthCallback *truth_cb_; ///< Callback for setting truth_* in WERD_RES.
- /**
- * @defgroup ThresholderParams Thresholder Parameters
- * Parameters saved from the Thresholder. Needed to rebuild coordinates.
- */
- /* @{ */
- int rect_left_;
- int rect_top_;
- int rect_width_;
- int rect_height_;
- int image_width_;
- int image_height_;
- /* @} */
- private:
- // A list of image filenames gets special consideration.
- bool ProcessPagesFileList(FILE *fp,
- STRING *buf,
- const char* retry_config, int timeout_millisec,
- TessResultRenderer* renderer,
- int tessedit_page_number);
- // TIFF supports multiple pages per file, so it gets special consideration.
- bool ProcessPagesMultipageTiff(const unsigned char *data,
- size_t size,
- const char* filename,
- const char* retry_config,
- int timeout_millisec,
- TessResultRenderer* renderer,
- int tessedit_page_number);
- // There's currently no way to pass a document title from the
- // Tesseract command line, and we have multiple places that choose
- // to set the title to an empty string. Using a single named
- // variable will hopefully reduce confusion if the situation changes
- // in the future.
- const char *unknown_title_ = "";
- }; // class TessBaseAPI.
- /** Escape a char string - replace &<>"' with HTML codes. */
- STRING HOcrEscape(const char* text);
- } // namespace tesseract.
- #endif // TESSERACT_API_BASEAPI_H_
|