diff options
Diffstat (limited to 'tesseract/src/ccstruct/werd.h')
-rw-r--r-- | tesseract/src/ccstruct/werd.h | 199 |
1 files changed, 199 insertions, 0 deletions
diff --git a/tesseract/src/ccstruct/werd.h b/tesseract/src/ccstruct/werd.h new file mode 100644 index 00000000..4e190960 --- /dev/null +++ b/tesseract/src/ccstruct/werd.h @@ -0,0 +1,199 @@ +/********************************************************************** + * File: werd.h + * Description: Code for the WERD class. + * Author: Ray Smith + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifndef WERD_H +#define WERD_H + +#include "bits16.h" +#include "elst2.h" +#include "params.h" +#include "stepblob.h" + +#include "strngs.h" + +namespace tesseract { + +enum WERD_FLAGS { + W_SEGMENTED, ///< correctly segmented + W_ITALIC, ///< italic text + W_BOLD, ///< bold text + W_BOL, ///< start of line + W_EOL, ///< end of line + W_NORMALIZED, ///< flags + W_SCRIPT_HAS_XHEIGHT, ///< x-height concept makes sense. + W_SCRIPT_IS_LATIN, ///< Special case latin for y. splitting. + W_DONT_CHOP, ///< fixed pitch chopped + W_REP_CHAR, ///< repeated character + W_FUZZY_SP, ///< fuzzy space + W_FUZZY_NON, ///< fuzzy nonspace + W_INVERSE ///< white on black +}; + +enum DISPLAY_FLAGS { + /* Display flags bit number allocations */ + DF_BOX, ///< Bounding box + DF_TEXT, ///< Correct ascii + DF_POLYGONAL, ///< Polyg approx + DF_EDGE_STEP, ///< Edge steps + DF_BN_POLYGONAL, ///< BL normalisd polyapx + DF_BLAMER ///< Blamer information +}; + +class ROW; // forward decl + +class TESS_API WERD : public ELIST2_LINK { + public: + WERD() = default; + // WERD constructed with: + // blob_list - blobs of the word (we take this list's contents) + // blanks - number of blanks before the word + // text - correct text (outlives WERD) + WERD(C_BLOB_LIST* blob_list, uint8_t blanks, const char* text); + + // WERD constructed from: + // blob_list - blobs in the word + // clone - werd to clone flags, etc from. + WERD(C_BLOB_LIST* blob_list, WERD* clone); + + // Construct a WERD from a single_blob and clone the flags from this. + // W_BOL and W_EOL flags are set according to the given values. + WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob); + + ~WERD() = default; + + // assignment + WERD& operator=(const WERD& source); + + // This method returns a new werd constructed using the blobs in the input + // all_blobs list, which correspond to the blobs in this werd object. The + // blobs used to construct the new word are consumed and removed from the + // input all_blobs list. + // Returns nullptr if the word couldn't be constructed. + // Returns original blobs for which no matches were found in the output list + // orphan_blobs (appends). + WERD* ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, + C_BLOB_LIST* orphan_blobs); + + // Accessors for reject / DUFF blobs in various formats + C_BLOB_LIST* rej_cblob_list() { // compact format + return &rej_cblobs; + } + + // Accessors for good blobs in various formats. + C_BLOB_LIST* cblob_list() { // get compact blobs + return &cblobs; + } + + uint8_t space() { // access function + return blanks; + } + void set_blanks(uint8_t new_blanks) { blanks = new_blanks; } + int script_id() const { return script_id_; } + void set_script_id(int id) { script_id_ = id; } + + // Returns the (default) bounding box including all the dots. + TBOX bounding_box() const; // compute bounding box + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const; + + const char* text() const { return correct.c_str(); } + void set_text(const char* new_text) { correct = new_text; } + + bool flag(WERD_FLAGS mask) const { return flags[mask]; } + void set_flag(WERD_FLAGS mask, bool value) { flags.set(mask, value); } + + bool display_flag(uint8_t flag) const { return disp_flags[flag]; } + void set_display_flag(uint8_t flag, bool value) { + disp_flags.set(flag, value); + } + + WERD* shallow_copy(); // shallow copy word + + // reposition word by vector + void move(const ICOORD vec); + + // join other's blobs onto this werd, emptying out other. + void join_on(WERD* other); + + // copy other's blobs onto this word, leaving other intact. + void copy_on(WERD* other); + + // tprintf word metadata (but not blob innards) + void print(); + +#ifndef GRAPHICS_DISABLED + // plot word on window in a uniform colour + void plot(ScrollView* window, ScrollView::Color colour); + + // Get the next color in the (looping) rainbow. + static ScrollView::Color NextColor(ScrollView::Color colour); + + // plot word on window in a rainbow of colours + void plot(ScrollView* window); + + // plot rejected blobs in a rainbow of colours + void plot_rej_blobs(ScrollView* window); +#endif // !GRAPHICS_DISABLED + + // Removes noise from the word by moving small outlines to the rej_cblobs + // list, based on the size_threshold. + void CleanNoise(float size_threshold); + + // Extracts all the noise outlines and stuffs the pointers into the given + // vector of outlines. Afterwards, the outlines vector owns the pointers. + void GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines); + // Adds the selected outlines to the indcated real blobs, and puts the rest + // back in rej_cblobs where they came from. Where the target_blobs entry is + // nullptr, a run of wanted outlines is put into a single new blob. + // Ownership of the outlines is transferred back to the word. (Hence + // GenericVector and not PointerVector.) + // Returns true if any new blob was added to the start of the word, which + // suggests that it might need joining to the word before it, and likewise + // sets make_next_word_fuzzy true if any new blob was added to the end. + bool AddSelectedOutlines(const GenericVector<bool>& wanted, + const GenericVector<C_BLOB*>& target_blobs, + const GenericVector<C_OUTLINE*>& outlines, + bool* make_next_word_fuzzy); + + private: + uint8_t blanks = 0; // no of blanks + BITS16 flags; // flags about word + BITS16 disp_flags; // display flags + int16_t script_id_ = 0; // From unicharset. + STRING correct; // correct text + C_BLOB_LIST cblobs; // compacted blobs + C_BLOB_LIST rej_cblobs; // DUFF blobs +}; + +ELIST2IZEH(WERD) + +} // namespace tesseract + +#include "ocrrow.h" // placed here due to + +namespace tesseract { + +// compare words by increasing order of left edge, suitable for qsort(3) +int word_comparator(const void* word1p, const void* word2p); + +} // namespace tesseract + +#endif |