///////////////////////////////////////////////////////////////////////
// File: lstm.h
// Description: Long-term-short-term-memory Recurrent neural network.
// Author: Ray Smith
// Created: Wed May 01 17:33:06 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_LSTM_LSTM_H_
#define TESSERACT_LSTM_LSTM_H_

#include "network.h"
#include "fullyconnected.h"

namespace tesseract {

// C++ Implementation of the LSTM class from lstm.py.
class LSTM : public Network {
 public:
  // Enum for the different weights in LSTM, to reduce some of the I/O and
  // setup code to loops. The elements of the enum correspond to elements of an
  // array of WeightMatrix or a corresponding array of NetworkIO.
  enum WeightType {
    CI,       // Cell Inputs.
    GI,       // Gate at the input.
    GF1,      // Forget gate at the memory (1-d or looking back 1 timestep).
    GO,       // Gate at the output.
    GFS,      // Forget gate at the memory, looking back in the other dimension.
    WT_COUNT  // Number of WeightTypes.
  };

  // Constructor for NT_LSTM (regular 1 or 2-d LSTM), NT_LSTM_SOFTMAX (LSTM with
  // additional softmax layer included and fed back into the input at the next
  // timestep), or NT_LSTM_SOFTMAX_ENCODED (as LSTM_SOFTMAX, but the feedback
  // is binary encoded instead of categorical) only.
  // 2-d and bidi softmax LSTMs are not rejected, but are impossible to build
  // in the conventional way because the output feedback both forwards and
  // backwards in time does become impossible.
  LSTM(const STRING& name, int num_inputs, int num_states, int num_outputs,
       bool two_dimensional, NetworkType type);
  ~LSTM() override;

  // Returns the shape output from the network given an input shape (which may
  // be partially unknown ie zero).
  StaticShape OutputShape(const StaticShape& input_shape) const override;

  // Returns the network-spec string for this layer: a type-dependent prefix
  // ("Lfx"/"Lfxs"/"LS"/"LE") plus the state count ns_, followed by the
  // attached softmax's spec when one is present.
  STRING spec() const override {
    STRING spec;
    if (type_ == NT_LSTM)
      spec.add_str_int("Lfx", ns_);
    else if (type_ == NT_LSTM_SUMMARY)
      spec.add_str_int("Lfxs", ns_);
    else if (type_ == NT_LSTM_SOFTMAX)
      spec.add_str_int("LS", ns_);
    else if (type_ == NT_LSTM_SOFTMAX_ENCODED)
      spec.add_str_int("LE", ns_);
    if (softmax_ != nullptr) spec += softmax_->spec();
    return spec;
  }

  // Suspends/Enables training by setting the training_ flag. Serialize and
  // DeSerialize only operate on the run-time data if state is false.
  void SetEnableTraining(TrainingState state) override;

  // Sets up the network for training. Initializes weights using weights of
  // scale `range` picked according to the random number generator `randomizer`.
  int InitWeights(float range, TRand* randomizer) override;

  // Recursively searches the network for softmaxes with old_no outputs,
  // and remaps their outputs according to code_map. See network.h for details.
  int RemapOutputs(int old_no, const std::vector<int>& code_map) override;

  // Converts a float network to an int network.
  void ConvertToInt() override;

  // Provides debug output on the weights.
  void DebugWeights() override;

  // Writes to the given file. Returns false in case of error.
  bool Serialize(TFile* fp) const override;
  // Reads from the given file. Returns false in case of error.
  bool DeSerialize(TFile* fp) override;

  // Runs forward propagation of activations on the input line.
  // See Network for a detailed discussion of the arguments.
  void Forward(bool debug, const NetworkIO& input,
               const TransposedArray* input_transpose, NetworkScratch* scratch,
               NetworkIO* output) override;

  // Runs backward propagation of errors on the deltas line.
  // See Network for a detailed discussion of the arguments.
  bool Backward(bool debug, const NetworkIO& fwd_deltas,
                NetworkScratch* scratch, NetworkIO* back_deltas) override;

  // Updates the weights using the given learning rate, momentum and adam_beta.
  // num_samples is used in the adam computation iff use_adam_ is true.
  void Update(float learning_rate, float momentum, float adam_beta,
              int num_samples) override;

  // Sums the products of weight updates in *this and other, splitting into
  // positive (same direction) in *same and negative (different direction) in
  // *changed.
  void CountAlternators(const Network& other, double* same,
                        double* changed) const override;

  // Prints the weights for debug purposes.
  void PrintW();
  // Prints the weight deltas for debug purposes.
  void PrintDW();

  // Returns true if this is a 2-d lstm.
  bool Is2D() const {
    return is_2d_;
  }

 private:
  // Resizes forward data to cope with an input image of the given width.
  void ResizeForward(const NetworkIO& input);

 private:
  // Size of padded input to weight matrices = ni_ + no_ for 1-D operation
  // and ni_ + 2 * no_ for 2-D operation. Note that there is a phantom 1 input
  // for the bias that makes the weight matrices of size [na + 1][no].
  int32_t na_;
  // Number of internal states. Equal to no_ except for a softmax LSTM.
  // ns_ is NOT serialized, but is calculated from gate_weights_.
  int32_t ns_;
  // Number of additional feedback states. The softmax types feed back
  // additional output information on top of the ns_ internal states.
  // In the case of a binary-coded (EMBEDDED) softmax, nf_ < no_.
  int32_t nf_;
  // Flag indicating 2-D operation.
  bool is_2d_;

  // Gate weight arrays of size [na + 1, no].
  WeightMatrix gate_weights_[WT_COUNT];
  // Used only if this is a softmax LSTM.
  FullyConnected* softmax_;
  // Input padded with previous output of size [width, na].
  NetworkIO source_;
  // Internal state used during forward operation, of size [width, ns].
  NetworkIO state_;
  // State of the 2-d maxpool, generated during forward, used during backward.
  GENERIC_2D_ARRAY<int8_t> which_fg_;
  // Internal state saved from forward, but used only during backward.
  NetworkIO node_values_[WT_COUNT];
  // Preserved input stride_map used for Backward when NT_LSTM_SQUASHED.
  StrideMap input_map_;
  // NOTE(review): presumably the input width captured by the last Forward
  // pass, for use during Backward — confirm against lstm.cpp.
  int input_width_;
};

}  // namespace tesseract.

#endif  // TESSERACT_LSTM_LSTM_H_