tesseract/src/ccstruct/imagedata.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

///////////////////////////////////////////////////////////////////////
// File:        imagedata.h
// Description: Class to hold information about a single image and its
//              corresponding boxes or text file.
// Author:      Ray Smith
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_IMAGE_IMAGEDATA_H_
#define TESSERACT_IMAGE_IMAGEDATA_H_

#include "points.h"             // for FCOORD

#include "genericvector.h"      // for GenericVector, PointerVector, FileReader
#include "strngs.h"   // for STRING

#include <mutex>                // for std::mutex
#include <thread>               // for std::thread

struct Pix;

namespace tesseract {

class TFile;
class ScrollView;
class TBOX;

// Amount of padding to apply in output pixels in feature mode.
const int kFeaturePadding = 2;
// Number of pixels to pad around text boxes.
const int kImagePadding = 4;

// Enum to determine the caching and data sequencing strategy.
enum CachingStrategy {
  // Reads all of one file before moving on to the next. Requires samples to be
  // shuffled across files. Uses the count of samples in the first file as
  // the count in all the files to achieve high-speed random access. As a
  // consequence, if subsequent files are smaller, they get entries used more
  // than once, and if subsequent files are larger, some entries are not used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL,
  // Reads one sample from each file in rotation. Does not require shuffled
  // samples, but is extremely disk-intensive. Samples in smaller files also
  // get used more often than samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN,
};

class WordFeature {
 public:
  WordFeature();
  WordFeature(const FCOORD& fcoord, uint8_t dir);

  // Computes the maximum x and y value in the features.
  static void ComputeSize(const GenericVector<WordFeature>& features,
                          int* max_x, int* max_y);
  // Draws the features in the given window.
  static void Draw(const GenericVector<WordFeature>& features,
                   ScrollView* window);

  // Accessors.
  int x() const { return x_; }
  int y() const { return y_; }
  int dir() const { return dir_; }

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

 private:
  int16_t x_;
  uint8_t y_;
  uint8_t dir_;
};

// A floating-point version of WordFeature, used as an intermediate during
// scaling.
struct FloatWordFeature {
  static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
                               GenericVector<FloatWordFeature>* float_features);
  // Sort function to sort first by x-bucket, then by y.
  static int SortByXBucket(const void*, const void*);

  float x;
  float y;
  float dir;
  int x_bucket;
};

// Class to hold information on a single image:
// Filename, cached image as a Pix*, character boxes, text transcription.
// The text transcription is the ground truth UTF-8 text for the image.
// Character boxes are optional and indicate the desired segmentation of
// the text into recognition units.
class TESS_API ImageData {
 public:
  ImageData();
  // Takes ownership of the pix.
  ImageData(bool vertical, Pix* pix);
  ~ImageData();

  // Builds and returns an ImageData from the basic data. Note that imagedata,
  // truth_text, and box_text are all the actual file data, NOT filenames.
  static ImageData* Build(const char* name, int page_number, const char* lang,
                          const char* imagedata, int imagedatasize,
                          const char* truth_text, const char* box_text);

  // Writes to the given file. Returns false in case of error.
  bool Serialize(TFile* fp) const;
  // Reads from the given file. Returns false in case of error.
  bool DeSerialize(TFile* fp);
  // As DeSerialize, but only seeks past the data - hence a static method.
  static bool SkipDeSerialize(TFile* fp);

  // Other accessors.
  const STRING& imagefilename() const {
    return imagefilename_;
  }
  void set_imagefilename(const STRING& name) {
    imagefilename_ = name;
  }
  int page_number() const {
    return page_number_;
  }
  void set_page_number(int num) {
    page_number_ = num;
  }
  const GenericVector<char>& image_data() const {
    return image_data_;
  }
  const STRING& language() const {
    return language_;
  }
  void set_language(const STRING& lang) {
    language_ = lang;
  }
  const STRING& transcription() const {
    return transcription_;
  }
  const GenericVector<TBOX>& boxes() const {
    return boxes_;
  }
  const GenericVector<STRING>& box_texts() const {
    return box_texts_;
  }
  const STRING& box_text(int index) const {
    return box_texts_[index];
  }
  // Saves the given Pix as a PNG-encoded string and destroys it.
  // In case of missing PNG support in Leptonica use PNM format,
  // which requires more memory.
  void SetPix(Pix* pix);
  // Returns the Pix image for *this. Must be pixDestroyed after use.
  Pix* GetPix() const;
  // Gets anything and everything with a non-nullptr pointer, prescaled to a
  // given target_height (if 0, then the original image height), and aligned.
  // Also returns (if not nullptr) the width and height of the scaled image.
  // The return value is the scaled Pix, which must be pixDestroyed after use,
  // and scale_factor (if not nullptr) is set to the scale factor that was applied
  // to the image to achieve the target_height.
  Pix* PreScale(int target_height, int max_height, float* scale_factor,
                int* scaled_width, int* scaled_height,
                GenericVector<TBOX>* boxes) const;

  int MemoryUsed() const;

  // Draws the data in a new window.
  void Display() const;

  // Adds the supplied boxes and transcriptions that correspond to the correct
  // page number.
  void AddBoxes(const std::vector<TBOX>& boxes,
                const std::vector<STRING>& texts,
                const std::vector<int>& box_pages);

 private:
  // Saves the given Pix as a PNG-encoded string and destroys it.
  // In case of missing PNG support in Leptonica use PNM format,
  // which requires more memory.
  static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
  static Pix* GetPixInternal(const GenericVector<char>& image_data);
  // Parses the text string as a box file and adds any discovered boxes that
  // match the page number. Returns false on error.
  bool AddBoxes(const char* box_text);

 private:
  STRING imagefilename_;             // File to read image from.
  int32_t page_number_;              // Page number if multi-page tif or -1.
#ifdef TESSERACT_IMAGEDATA_AS_PIX
  Pix *internal_pix_;
#endif
  GenericVector<char> image_data_;   // PNG/PNM file data.
  STRING language_;                  // Language code for image.
  STRING transcription_;             // UTF-8 ground truth of image.
  GenericVector<TBOX> boxes_;        // If non-empty boxes of the image.
  GenericVector<STRING> box_texts_;  // String for text in each box.
  bool vertical_text_;               // Image has been rotated from vertical.
};

// A collection of ImageData that knows roughly how much memory it is using.
class DocumentData {
 public:
  TESS_API
  explicit DocumentData(const STRING& name);
  TESS_API
  ~DocumentData();

  // Reads all the pages in the given lstmf filename to the cache. The reader
  // is used to read the file.
  TESS_API
  bool LoadDocument(const char* filename, int start_page, int64_t max_memory,
                    FileReader reader);
  // Sets up the document, without actually loading it.
  void SetDocument(const char* filename, int64_t max_memory, FileReader reader);
  // Writes all the pages to the given filename. Returns false on error.
  TESS_API
  bool SaveDocument(const char* filename, FileWriter writer);

  // Adds the given page data to this document, counting up memory.
  TESS_API
  void AddPageToDocument(ImageData* page);

  const STRING& document_name() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return document_name_;
  }
  int NumPages() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return total_pages_;
  }
  size_t PagesSize() const {
    return pages_.size();
  }
  int64_t memory_used() const {
    std::lock_guard<std::mutex> lock(general_mutex_);
    return memory_used_;
  }
  // If the given index is not currently loaded, loads it using a separate
  // thread. Note: there are 4 cases:
  // Document uncached: IsCached() returns false, total_pages_ < 0.
  // Required page is available: IsPageAvailable returns true. In this case,
  // total_pages_ > 0 and
  // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
  // Pages are loaded, but the required one is not.
  // The requested page is being loaded by LoadPageInBackground. In this case,
  // index == pages_offset_. Once the loading starts, the pages lock is held
  // until it completes, at which point IsPageAvailable will unblock and return
  // true.
  void LoadPageInBackground(int index);
  // Returns a pointer to the page with the given index, modulo the total
  // number of pages. Blocks until the background load is completed.
  TESS_API
  const ImageData* GetPage(int index);
  // Returns true if the requested page is available, and provides a pointer,
  // which may be nullptr if the document is empty. May block, even though it
  // doesn't guarantee to return true.
  bool IsPageAvailable(int index, ImageData** page);
  // Takes ownership of the given page index. The page is made nullptr in *this.
  ImageData* TakePage(int index) {
    std::lock_guard<std::mutex> lock(pages_mutex_);
    ImageData* page = pages_[index];
    pages_[index] = nullptr;
    return page;
  }
  // Returns true if the document is currently loaded or in the process of
  // loading.
  bool IsCached() const { return NumPages() >= 0; }
  // Removes all pages from memory and frees the memory, but does not forget
  // the document metadata. Returns the memory saved.
  int64_t UnCache();
  // Shuffles all the pages in the document.
  void Shuffle();

 private:
  // Sets the value of total_pages_ behind a mutex.
  void set_total_pages(int total) {
    std::lock_guard<std::mutex> lock(general_mutex_);
    total_pages_ = total;
  }
  void set_memory_used(int64_t memory_used) {
    std::lock_guard<std::mutex> lock(general_mutex_);
    memory_used_ = memory_used;
  }
  // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
  // starting at index pages_offset_.
  bool ReCachePages();

 private:
  // A name for this document.
  STRING document_name_;
  // A group of pages that corresponds in some loose way to a document.
  PointerVector<ImageData> pages_;
  // Page number of the first index in pages_.
  int pages_offset_;
  // Total number of pages in document (may exceed size of pages_.)
  int total_pages_;
  // Total of all pix sizes in the document.
  int64_t memory_used_;
  // Max memory to use at any time.
  int64_t max_memory_;
  // Saved reader from LoadDocument to allow re-caching.
  FileReader reader_;
  // Mutex that protects pages_ and pages_offset_ against multiple parallel
  // loads, and provides a wait for page.
  std::mutex pages_mutex_;
  // Mutex that protects other data members that callers want to access without
  // waiting for a load operation.
  mutable std::mutex general_mutex_;

  // Thread which loads document.
  std::thread thread;
};

// A collection of DocumentData that knows roughly how much memory it is using.
// Note that while it supports background read-ahead, it assumes that a single
// thread is accessing documents, ie it is not safe for multiple threads to
// access different documents in parallel, as one may de-cache the other's
// content.
class DocumentCache {
 public:
  TESS_API
  explicit DocumentCache(int64_t max_memory);
  TESS_API
  ~DocumentCache();

  // Deletes all existing documents from the cache.
  void Clear() {
    documents_.clear();
    num_pages_per_doc_ = 0;
  }
  // Adds all the documents in the list of filenames, counting memory.
  // The reader is used to read the files.
  TESS_API
  bool LoadDocuments(const std::vector<STRING>& filenames,
                     CachingStrategy cache_strategy, FileReader reader);

  // Adds document to the cache.
  bool AddToCache(DocumentData* data);

  // Finds and returns a document by name.
  DocumentData* FindDocument(const STRING& document_name) const;

  // Returns a page by serial number using the current cache_strategy_ to
  // determine the mapping from serial number to page.
  const ImageData* GetPageBySerial(int serial) {
    if (cache_strategy_ == CS_SEQUENTIAL)
      return GetPageSequential(serial);
    else
      return GetPageRoundRobin(serial);
  }

  const PointerVector<DocumentData>& documents() const {
    return documents_;
  }
  // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
  // strategy, could take a long time.
  TESS_API
  int TotalPages();

 private:
  // Returns a page by serial number, selecting them in a round-robin fashion
  // from all the documents. Highly disk-intensive, but doesn't need samples
  // to be shuffled between files to begin with.
  TESS_API
  const ImageData* GetPageRoundRobin(int serial);
  // Returns a page by serial number, selecting them in sequence from each file.
  // Requires the samples to be shuffled between the files to give a random or
  // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
  TESS_API
  const ImageData* GetPageSequential(int serial);

  // Helper counts the number of adjacent cached neighbour documents_ of index
  // looking in direction dir, ie index+dir, index+2*dir etc.
  int CountNeighbourDocs(int index, int dir);

  // A group of pages that corresponds in some loose way to a document.
  PointerVector<DocumentData> documents_;
  // Strategy to use for caching and serializing data samples.
  CachingStrategy cache_strategy_;
  // Number of pages in the first document, used as a divisor in
  // GetPageSequential to determine the document index.
  int num_pages_per_doc_;
  // Max memory allowed in this cache.
  int64_t max_memory_;
};

}  // namespace tesseract


#endif  // TESSERACT_IMAGE_IMAGEDATA_H_