networkio.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. ///////////////////////////////////////////////////////////////////////
  2. // File: networkio.h
  3. // Description: Network input/output data, allowing float/int implementations.
  4. // Author: Ray Smith
  5. // Created: Tue Jun 17 08:43:11 PST 2014
  6. //
  7. // (C) Copyright 2014, Google Inc.
  8. // Licensed under the Apache License, Version 2.0 (the "License");
  9. // you may not use this file except in compliance with the License.
  10. // You may obtain a copy of the License at
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. // Unless required by applicable law or agreed to in writing, software
  13. // distributed under the License is distributed on an "AS IS" BASIS,
  14. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. // See the License for the specific language governing permissions and
  16. // limitations under the License.
  17. ///////////////////////////////////////////////////////////////////////
  18. #ifndef TESSERACT_LSTM_NETWORKIO_H_
  19. #define TESSERACT_LSTM_NETWORKIO_H_
  20. #include <cmath>
  21. #include <cstdio>
  22. #include <vector>
  23. #include "genericvector.h"
  24. #include "helpers.h"
  25. #include "static_shape.h"
  26. #include "stridemap.h"
  27. #include "weightmatrix.h"
  28. struct Pix;
  29. namespace tesseract {
  30. // Class to contain all the input/output of a network, allowing for fixed or
  31. // variable-strided 2d to 1d mapping, and float or int8_t values. Provides
  32. // enough calculating functions to hide the detail of the implementation.
  33. class NetworkIO {
  34. public:
  35. NetworkIO() : int_mode_(false) {}
  36. // Resizes the array (and stride), avoiding realloc if possible, to the given
  37. // size from various size specs:
  38. // Same stride size, but given number of features.
  39. void Resize(const NetworkIO& src, int num_features) {
  40. ResizeToMap(src.int_mode(), src.stride_map(), num_features);
  41. }
  42. // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
  43. void Resize2d(bool int_mode, int width, int num_features);
  44. // Resizes forcing a float representation with the stridemap of src and the
  45. // given number of features.
  46. void ResizeFloat(const NetworkIO& src, int num_features) {
  47. ResizeToMap(false, src.stride_map(), num_features);
  48. }
  49. // Resizes to a specific stride_map.
  50. void ResizeToMap(bool int_mode, const StrideMap& stride_map,
  51. int num_features);
  52. // Shrinks image size by x_scale,y_scale, and use given number of features.
  53. void ResizeScaled(const NetworkIO& src, int x_scale, int y_scale,
  54. int num_features);
  55. // Resizes to just 1 x-coord, whatever the input.
  56. void ResizeXTo1(const NetworkIO& src, int num_features);
  57. // Initialize all the array to zero.
  58. void Zero();
  59. // Initializes to zero all elements of the array that do not correspond to
  60. // valid image positions. (If a batch of different-sized images are packed
  61. // together, then there will be padding pixels.)
  62. void ZeroInvalidElements();
  63. // Sets up the array from the given image, using the currently set int_mode_.
  64. // If the image width doesn't match the shape, the image is truncated or
  65. // padded with noise to match.
  66. void FromPix(const StaticShape& shape, const Pix* pix, TRand* randomizer);
  67. // Sets up the array from the given set of images, using the currently set
  68. // int_mode_. If the image width doesn't match the shape, the images are
  69. // truncated or padded with noise to match.
  70. void FromPixes(const StaticShape& shape, const std::vector<const Pix*>& pixes,
  71. TRand* randomizer);
  72. // Copies the given pix to *this at the given batch index, stretching and
  73. // clipping the pixel values so that [black, black + 2*contrast] maps to the
  74. // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.
  75. // This is a 2-d operation in the sense that the output depth is the number
  76. // of input channels, the height is the height of the image, and the width
  77. // is the width of the image, or truncated/padded with noise if the width
  78. // is a fixed size.
  79. void Copy2DImage(int batch, Pix* pix, float black, float contrast,
  80. TRand* randomizer);
  81. // Copies the given pix to *this at the given batch index, as Copy2DImage
  82. // above, except that the output depth is the height of the input image, the
  83. // output height is 1, and the output width as for Copy2DImage.
  84. // The image is thus treated as a 1-d set of vertical pixel strips.
  85. void Copy1DGreyImage(int batch, Pix* pix, float black, float contrast,
  86. TRand* randomizer);
  87. // Helper stores the pixel value in i_ or f_ according to int_mode_.
  88. // t: is the index from the StrideMap corresponding to the current
  89. // [batch,y,x] position
  90. // f: is the index into the depth/channel
  91. // pixel: the value of the pixel from the image (in one channel)
  92. // black: the pixel value to map to the lowest of the range of *this
  93. // contrast: the range of pixel values to stretch to half the range of *this.
  94. void SetPixel(int t, int f, int pixel, float black, float contrast);
  95. // Converts the array to a Pix. Must be pixDestroyed after use.
  96. Pix* ToPix() const;
  97. // Prints the first and last num timesteps of the array for each feature.
  98. void Print(int num) const;
  99. // Returns the timestep width.
  100. int Width() const {
  101. return int_mode_ ? i_.dim1() : f_.dim1();
  102. }
  103. // Returns the number of features.
  104. int NumFeatures() const {
  105. return int_mode_ ? i_.dim2() : f_.dim2();
  106. }
  107. // Accessor to a timestep of the float matrix.
  108. float* f(int t) {
  109. ASSERT_HOST(!int_mode_);
  110. return f_[t];
  111. }
  112. const float* f(int t) const {
  113. ASSERT_HOST(!int_mode_);
  114. return f_[t];
  115. }
  116. const int8_t* i(int t) const {
  117. ASSERT_HOST(int_mode_);
  118. return i_[t];
  119. }
  120. bool int_mode() const {
  121. return int_mode_;
  122. }
  123. void set_int_mode(bool is_quantized) {
  124. int_mode_ = is_quantized;
  125. }
  126. const StrideMap& stride_map() const {
  127. return stride_map_;
  128. }
  129. void set_stride_map(const StrideMap& map) {
  130. stride_map_ = map;
  131. }
  132. const GENERIC_2D_ARRAY<float>& float_array() const { return f_; }
  133. GENERIC_2D_ARRAY<float>* mutable_float_array() { return &f_; }
  134. // Copies a single time step from src.
  135. void CopyTimeStepFrom(int dest_t, const NetworkIO& src, int src_t);
  136. // Copies a part of single time step from src.
  137. void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features,
  138. const NetworkIO& src, int src_t, int src_offset);
  139. // Zeroes a single time step.
  140. void ZeroTimeStep(int t) { ZeroTimeStepGeneral(t, 0, NumFeatures()); }
  141. void ZeroTimeStepGeneral(int t, int offset, int num_features);
  142. // Sets the given range to random values.
  143. void Randomize(int t, int offset, int num_features, TRand* randomizer);
  144. // Helper returns the label and score of the best choice over a range.
  145. int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch,
  146. float* rating, float* certainty) const;
  147. // Helper returns the rating and certainty of the choice over a range in t.
  148. void ScoresOverRange(int t_start, int t_end, int choice, int null_ch,
  149. float* rating, float* certainty) const;
  150. // Returns the index (label) of the best value at the given timestep,
  151. // and if not null, sets the score to the log of the corresponding value.
  152. int BestLabel(int t, float* score) const {
  153. return BestLabel(t, -1, -1, score);
  154. }
  155. // Returns the index (label) of the best value at the given timestep,
  156. // excluding not_this and not_that, and if not null, sets the score to the
  157. // log of the corresponding value.
  158. int BestLabel(int t, int not_this, int not_that, float* score) const;
  159. // Returns the best start position out of range (into which both start and end
  160. // must fit) to obtain the highest cumulative score for the given labels.
  161. int PositionOfBestMatch(const GenericVector<int>& labels, int start,
  162. int end) const;
  163. // Returns the cumulative score of the given labels starting at start, and
  164. // using one label per time-step.
  165. double ScoreOfLabels(const GenericVector<int>& labels, int start) const;
  166. // Helper function sets all the outputs for a single timestep, such that
  167. // label has value ok_score, and the other labels share 1 - ok_score.
  168. // Assumes float mode.
  169. void SetActivations(int t, int label, float ok_score);
  170. // Modifies the values, only if needed, so that the given label is
  171. // the winner at the given time step t.
  172. // Assumes float mode.
  173. void EnsureBestLabel(int t, int label);
  174. // Helper function converts prob to certainty taking the minimum into account.
  175. static float ProbToCertainty(float prob);
  176. // Returns true if there is any bad value that is suspiciously like a GT
  177. // error. Assuming that *this is the difference(gradient) between target
  178. // and forward output, returns true if there is a large negative value
  179. // (correcting a very confident output) for which there is no corresponding
  180. // positive value in an adjacent timestep for the same feature index. This
  181. // allows the box-truthed samples to make fine adjustments to position while
  182. // stopping other disagreements of confident output with ground truth.
  183. bool AnySuspiciousTruth(float confidence_thr) const;
  184. // Reads a single timestep to floats in the range [-1, 1].
  185. void ReadTimeStep(int t, double* output) const;
  186. // Adds a single timestep to floats.
  187. void AddTimeStep(int t, double* inout) const;
  188. // Adds part of a single timestep to floats.
  189. void AddTimeStepPart(int t, int offset, int num_features, float* inout) const;
  190. // Writes a single timestep from floats in the range [-1, 1].
  191. void WriteTimeStep(int t, const double* input);
  192. // Writes a single timestep from floats in the range [-1, 1] writing only
  193. // num_features elements of input to (*this)[t], starting at offset.
  194. void WriteTimeStepPart(int t, int offset, int num_features,
  195. const double* input);
  196. // Maxpools a single time step from src.
  197. void MaxpoolTimeStep(int dest_t, const NetworkIO& src, int src_t,
  198. int* max_line);
  199. // Runs maxpool backward, using maxes to index timesteps in *this.
  200. void MaxpoolBackward(const NetworkIO& fwd,
  201. const GENERIC_2D_ARRAY<int>& maxes);
  202. // Returns the min over time of the maxes over features of the outputs.
  203. float MinOfMaxes() const;
  204. // Returns the min over time.
  205. float Max() const { return int_mode_ ? i_.Max() : f_.Max(); }
  206. // Computes combined results for a combiner that chooses between an existing
  207. // input and itself, with an additional output to indicate the choice.
  208. void CombineOutputs(const NetworkIO& base_output,
  209. const NetworkIO& combiner_output);
  210. // Computes deltas for a combiner that chooses between 2 sets of inputs.
  211. void ComputeCombinerDeltas(const NetworkIO& fwd_deltas,
  212. const NetworkIO& base_output);
  213. // Copies the array checking that the types match.
  214. void CopyAll(const NetworkIO& src);
  215. // Adds the array to a float array, with scaling to [-1, 1] if the src is int.
  216. void AddAllToFloat(const NetworkIO& src);
  217. // Subtracts the array from a float array. src must also be float.
  218. void SubtractAllFromFloat(const NetworkIO& src);
  219. // Copies src to *this, with maxabs normalization to match scale.
  220. void CopyWithNormalization(const NetworkIO& src, const NetworkIO& scale);
  221. // Multiplies the float data by the given factor.
  222. void ScaleFloatBy(float factor) { f_ *= factor; }
  223. // Copies src to *this with independent reversal of the y dimension.
  224. void CopyWithYReversal(const NetworkIO& src);
  225. // Copies src to *this with independent reversal of the x dimension.
  226. void CopyWithXReversal(const NetworkIO& src);
  227. // Copies src to *this with independent transpose of the x and y dimensions.
  228. void CopyWithXYTranspose(const NetworkIO& src);
  229. // Copies src to *this, at the given feature_offset, returning the total
  230. // feature offset after the copy. Multiple calls will stack outputs from
  231. // multiple sources in feature space.
  232. int CopyPacking(const NetworkIO& src, int feature_offset);
  233. // Opposite of CopyPacking, fills *this with a part of src, starting at
  234. // feature_offset, and picking num_features. Resizes *this to match.
  235. void CopyUnpacking(const NetworkIO& src, int feature_offset,
  236. int num_features);
  237. // Transposes the float part of *this into dest.
  238. void Transpose(TransposedArray* dest) const;
  239. // Clips the content of a single time-step to +/-range.
  240. void ClipVector(int t, float range);
  241. // Applies Func to timestep t of *this (u) and multiplies the result by v
  242. // component-wise, putting the product in *product.
  243. // *this and v may be int or float, but must match. The outputs are double.
  244. template <class Func>
  245. void FuncMultiply(const NetworkIO& v_io, int t, double* product) {
  246. Func f;
  247. ASSERT_HOST(!int_mode_);
  248. ASSERT_HOST(!v_io.int_mode_);
  249. int dim = f_.dim2();
  250. if (int_mode_) {
  251. const int8_t* u = i_[t];
  252. const int8_t* v = v_io.i_[t];
  253. for (int i = 0; i < dim; ++i) {
  254. product[i] = f(u[i] / static_cast<double>(INT8_MAX)) * v[i] /
  255. static_cast<double>(INT8_MAX);
  256. }
  257. } else {
  258. const float* u = f_[t];
  259. const float* v = v_io.f_[t];
  260. for (int i = 0; i < dim; ++i) {
  261. product[i] = f(u[i]) * v[i];
  262. }
  263. }
  264. }
  265. // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
  266. // component-wise, putting the product in *product.
  267. // All NetworkIOs are assumed to be float.
  268. template <class Func>
  269. void FuncMultiply3(int u_t, const NetworkIO& v_io, int v_t, const double* w,
  270. double* product) const {
  271. ASSERT_HOST(!int_mode_);
  272. ASSERT_HOST(!v_io.int_mode_);
  273. Func f;
  274. const float* u = f_[u_t];
  275. const float* v = v_io.f_[v_t];
  276. int dim = f_.dim2();
  277. for (int i = 0; i < dim; ++i) {
  278. product[i] = f(u[i]) * v[i] * w[i];
  279. }
  280. }
  281. // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
  282. // component-wise, adding the product to *product.
  283. // All NetworkIOs are assumed to be float.
  284. template <class Func>
  285. void FuncMultiply3Add(const NetworkIO& v_io, int t, const double* w,
  286. double* product) const {
  287. ASSERT_HOST(!int_mode_);
  288. ASSERT_HOST(!v_io.int_mode_);
  289. Func f;
  290. const float* u = f_[t];
  291. const float* v = v_io.f_[t];
  292. int dim = f_.dim2();
  293. for (int i = 0; i < dim; ++i) {
  294. product[i] += f(u[i]) * v[i] * w[i];
  295. }
  296. }
  297. // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w,
  298. // component-wise, putting the product in product, all at timestep t, except
  299. // w, which is a simple array. All NetworkIOs are assumed to be float.
  300. template <class Func1, class Func2>
  301. void Func2Multiply3(const NetworkIO& v_io, int t, const double* w,
  302. double* product) const {
  303. ASSERT_HOST(!int_mode_);
  304. ASSERT_HOST(!v_io.int_mode_);
  305. Func1 f;
  306. Func2 g;
  307. const float* u = f_[t];
  308. const float* v = v_io.f_[t];
  309. int dim = f_.dim2();
  310. for (int i = 0; i < dim; ++i) {
  311. product[i] = f(u[i]) * g(v[i]) * w[i];
  312. }
  313. }
  314. private:
  315. // Returns the padding required for the given number of features in order
  316. // for the SIMD operations to be safe.
  317. static int GetPadding(int num_features);
  318. // Choice of float vs 8 bit int for data.
  319. GENERIC_2D_ARRAY<float> f_;
  320. GENERIC_2D_ARRAY<int8_t> i_;
  321. // Which of f_ and i_ are we actually using.
  322. bool int_mode_;
  323. // Stride for 2d input data.
  324. StrideMap stride_map_;
  325. };
  326. } // namespace tesseract.
  327. #endif // TESSERACT_LSTM_NETWORKIO_H_