unicharmap.h 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: unicharmap.h
  3. // Description: Unicode character/ligature to integer id class.
  4. // Author: Thomas Kielbus
  5. // Created: Wed Jun 28 17:05:01 PDT 2006
  6. //
  7. // (C) Copyright 2006, Google Inc.
  8. // Licensed under the Apache License, Version 2.0 (the "License");
  9. // you may not use this file except in compliance with the License.
  10. // You may obtain a copy of the License at
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. // Unless required by applicable law or agreed to in writing, software
  13. // distributed under the License is distributed on an "AS IS" BASIS,
  14. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. // See the License for the specific language governing permissions and
  16. // limitations under the License.
  17. //
  18. ///////////////////////////////////////////////////////////////////////
  19. #ifndef TESSERACT_CCUTIL_UNICHARMAP_H_
  20. #define TESSERACT_CCUTIL_UNICHARMAP_H_
  21. #include "unichar.h"
  22. // A UNICHARMAP stores unique unichars. Each of them is associated with one
  23. // UNICHAR_ID.
  24. class UNICHARMAP {
  25. public:
  26. // Create an empty UNICHARMAP.
  27. UNICHARMAP();
  28. ~UNICHARMAP();  // NOTE(review): presumably releases the node tree; the definition is not in this header -- confirm.
  29. // Insert the given unichar representation in the UNICHARMAP and associate it
  30. // with the given id. The length of the representation MUST be non-zero.
  31. void insert(const char* const unichar_repr, UNICHAR_ID id);  // No length argument: unichar_repr is presumably NUL-terminated -- TODO confirm.
  32. // Return the id associated with the given unichar representation;
  33. // this representation MUST exist within the UNICHARMAP. At most the first
  34. // `length` characters of unichar_repr are used. `length`
  35. // MUST be non-zero.
  36. UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
  37. // Return true if the given unichar representation is already present in the
  38. // UNICHARMAP. At most the first `length` characters of unichar_repr are
  39. // used. `length` MUST be non-zero.
  40. bool contains(const char* const unichar_repr, int length) const;
  41. // Return the minimum number of characters that must be used from this string
  42. // to obtain a match in the UNICHARMAP.
  43. int minmatch(const char* const unichar_repr) const;  // NOTE(review): result when nothing matches is not stated here; input presumably NUL-terminated -- check the definition.
  44. // Clear the UNICHARMAP. All previous data is lost.
  45. void clear();
  46. private:
  47. // The UNICHARMAP is represented as a tree whose nodes are of type
  48. // UNICHARMAP_NODE.
  49. struct UNICHARMAP_NODE {
  50. UNICHARMAP_NODE();
  51. ~UNICHARMAP_NODE();
  52. UNICHARMAP_NODE* children;  // Child nodes of this node; presumably an array indexed by the next character -- TODO confirm.
  53. UNICHAR_ID id;  // Id stored at this node; presumably the id of the unichar ending here -- TODO confirm.
  54. };
  55. UNICHARMAP_NODE* nodes;  // Top level of the tree. NOTE(review): raw pointer and no copy constructor/assignment is declared, so a copied UNICHARMAP would hold the same pointer -- confirm copying is never done.
  56. };
  57. #endif // TESSERACT_CCUTIL_UNICHARMAP_H_