1 /**********************************************************************
2 * File: makerow.h (Formerly makerows.h)
3 * Description: Code to arrange blobs into rows of text.
4 * Author: Ray Smith
5 * Created: Mon Sep 21 14:34:48 BST 1992
6 *
7 * (C) Copyright 1992, Hewlett-Packard Ltd.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #ifndef MAKEROW_H
21 #define MAKEROW_H
22
23 #include "varable.h"
24 #include "ocrblock.h"
25 #include "tessclas.h"
26 #include "blobbox.h"
27 #include "statistc.h"
28 #include "notdll.h"
29 #include "tesseractclass.h"
30
// Outcome codes of type OVERLAP_STATE (the return type of
// most_overlapping_row, declared later in this header).
enum OVERLAP_STATE {
  ASSIGN,   // assign it to row
  REJECT,   // reject it - dual overlap
  NEW_ROW   // not annotated in the original; presumably "start a new row"
            // - TODO confirm against most_overlapping_row's definition
};
37
// Classification of a row as produced by get_row_category() in this header.
enum ROW_CATEGORY {
  ROW_ASCENDERS_FOUND,   // xheight > 0 and ascrise > 0
  ROW_DESCENDERS_FOUND,  // xheight > 0, ascrise <= 0, descdrop != 0
  ROW_UNKNOWN,           // xheight > 0 but neither of the above
  ROW_INVALID            // xheight <= 0
};
44
45 extern BOOL_VAR_H (textord_show_initial_rows, FALSE,
46 "Display row accumulation");
47 extern BOOL_VAR_H (textord_show_parallel_rows, FALSE,
48 "Display page correlated rows");
49 extern BOOL_VAR_H (textord_show_expanded_rows, FALSE,
50 "Display rows after expanding");
51 extern BOOL_VAR_H (textord_show_final_rows, FALSE,
52 "Display rows after final fitting");
53 extern BOOL_VAR_H (textord_show_final_blobs, FALSE,
54 "Display blob bounds after pre-ass");
55 extern BOOL_VAR_H (textord_test_landscape, FALSE, "Tests refer to land/port");
56 extern BOOL_VAR_H (textord_parallel_baselines, TRUE,
57 "Force parallel baselines");
58 extern BOOL_VAR_H (textord_straight_baselines, FALSE,
59 "Force straight baselines");
60 extern BOOL_VAR_H (textord_quadratic_baselines, FALSE,
61 "Use quadratic splines");
62 extern BOOL_VAR_H (textord_old_baselines, TRUE, "Use old baseline algorithm");
63 extern BOOL_VAR_H (textord_old_xheight, TRUE, "Use old xheight algorithm");
64 extern BOOL_VAR_H (textord_fix_xheight_bug, TRUE, "Use spline baseline");
65 extern BOOL_VAR_H (textord_fix_makerow_bug, TRUE,
66 "Prevent multiple baselines");
67 extern BOOL_VAR_H (textord_cblob_blockocc, TRUE,
68 "Use new projection for underlines");
69 extern BOOL_VAR_H (textord_debug_xheights, FALSE, "Test xheight algorithms");
70 extern INT_VAR_H (textord_test_x, 0, "coord of test pt");
71 extern INT_VAR_H (textord_test_y, 0, "coord of test pt");
72 extern INT_VAR_H (textord_min_blobs_in_row, 4,
73 "Min blobs before gradient counted");
74 extern INT_VAR_H (textord_spline_minblobs, 8,
75 "Min blobs in each spline segment");
76 extern INT_VAR_H (textord_spline_medianwin, 6,
77 "Size of window for spline segmentation");
78 extern INT_VAR_H (textord_min_xheight, 10, "Min credible pixel xheight");
79 extern double_VAR_H (textord_spline_shift_fraction, 0.02,
80 "Fraction of line spacing for quad");
81 extern double_VAR_H (textord_spline_outlier_fraction, 0.1,
82 "Fraction of line spacing for outlier");
83 extern double_VAR_H (textord_skew_ile, 0.5, "Ile of gradients for page skew");
84 extern double_VAR_H (textord_skew_lag, 0.75,
85 "Lag for skew on row accumulation");
86 extern double_VAR_H (textord_linespace_iqrlimit, 0.2,
87 "Max iqr/median for linespace");
88 extern double_VAR_H (textord_width_limit, 8,
89 "Max width of blobs to make rows");
90 extern double_VAR_H (textord_chop_width, 1.5, "Max width before chopping");
91 extern double_VAR_H (textord_merge_desc, 0.25,
92 "Fraction of linespace for desc drop");
93 extern double_VAR_H (textord_merge_x, 0.5,
94 "Fraction of linespace for x height");
95 extern double_VAR_H (textord_merge_asc, 0.25,
96 "Fraction of linespace for asc height");
97 extern double_VAR_H (textord_minxh, 0.25,
98 "fraction of linesize for min xheight");
99 extern double_VAR_H (textord_min_linesize, 1.25,
100 "* blob height for initial linesize");
101 extern double_VAR_H (textord_excess_blobsize, 1.3,
102 "New row made if blob makes row this big");
103 extern double_VAR_H (textord_occupancy_threshold, 0.4,
104 "Fraction of neighbourhood");
105 extern double_VAR_H (textord_underline_width, 2.0,
106 "Multiple of line_size for underline");
107 extern double_VAR_H(textord_min_blob_height_fraction, 0.75,
108 "Min blob height/top to include blob top into xheight stats");
109 extern double_VAR_H (textord_xheight_mode_fraction, 0.4,
110 "Min pile height to make xheight");
111 extern double_VAR_H (textord_ascheight_mode_fraction, 0.15,
112 "Min pile height to make ascheight");
113 extern double_VAR_H (textord_ascx_ratio_min, 1.2, "Min cap/xheight");
114 extern double_VAR_H (textord_ascx_ratio_max, 1.7, "Max cap/xheight");
115 extern double_VAR_H (textord_descx_ratio_min, 0.15, "Min desc/xheight");
116 extern double_VAR_H (textord_descx_ratio_max, 0.6, "Max desc/xheight");
117 extern double_VAR_H (textord_xheight_error_margin, 0.1, "Accepted variation");
118
get_min_max_xheight(double block_linesize,int * min_height,int * max_height)119 inline void get_min_max_xheight(double block_linesize,
120 int *min_height, int *max_height) {
121 *min_height = static_cast<inT32>(floor(block_linesize * textord_minxh));
122 if (*min_height < textord_min_xheight) *min_height = textord_min_xheight;
123 *max_height = static_cast<inT32>(ceil(block_linesize * 3));
124 }
125
get_row_category(const TO_ROW * row)126 inline ROW_CATEGORY get_row_category(const TO_ROW *row) {
127 if (row->xheight <= 0) return ROW_INVALID;
128 return (row->ascrise > 0) ? ROW_ASCENDERS_FOUND :
129 (row->descdrop != 0) ? ROW_DESCENDERS_FOUND : ROW_UNKNOWN;
130 }
131
// Returns true when test lies in the closed interval
// [num * (1 - margin), num * (1 + margin)].
// Note: for negative num (with margin > 0) the two bounds swap order, so the
// interval is empty and the result is false.
inline bool within_error_margin(float test, float num, float margin) {
  const float lower_bound = num * (1 - margin);
  const float upper_bound = num * (1 + margin);
  const bool at_or_above_lower = test >= lower_bound;
  const bool at_or_below_upper = test <= upper_bound;
  return at_or_above_lower && at_or_below_upper;
}
135
136 void fill_heights(TO_ROW *row, float gradient, int min_height,
137 int max_height, STATS *heights, STATS *floating_heights);
138
139 float make_single_row(ICOORD page_tr, TO_BLOCK* block,
140 TO_BLOCK_LIST* blocks, tesseract::Tesseract* tess);
141 float make_rows( //make rows
142 ICOORD page_tr, //top right
143 BLOCK_LIST *blocks, //block list
144 TO_BLOCK_LIST *land_blocks, //rotated for landscape
145 TO_BLOCK_LIST *port_blocks, //output list
146 tesseract::Tesseract* tess
147 );
148 void make_initial_textrows( //find lines
149 ICOORD page_tr,
150 TO_BLOCK *block, //block to do
151 FCOORD rotation, //for drawing
152 BOOL8 testing_on //correct orientation
153 );
154 void fit_lms_line( //sort function
155 TO_ROW *row //row to fit
156 );
157 void compute_page_skew( //get average gradient
158 TO_BLOCK_LIST *blocks, //list of blocks
159 float &page_m, //average gradient
160 float &page_err //average error
161 );
162 void cleanup_rows( //find lines
163 ICOORD page_tr, //top right
164 TO_BLOCK *block, //block to do
165 float gradient, //gradient to fit
166 FCOORD rotation, //for drawing
167 inT32 block_edge, //edge of block
168 BOOL8 testing_on, //correct orientation
169 tesseract::Tesseract* tess
170 );
171 void delete_non_dropout_rows( //find lines
172 TO_BLOCK *block, //block to do
173 float gradient, //global skew
174 FCOORD rotation, //deskew vector
175 inT32 block_edge, //left edge
176 BOOL8 testing_on //correct orientation
177 );
178 BOOL8 find_best_dropout_row( //find neighbours
179 TO_ROW *row, //row to test
180 inT32 distance, //dropout dist
181 float dist_limit, //threshold distance
182 inT32 line_index, //index of row
183 TO_ROW_IT *row_it, //current position
184 BOOL8 testing_on //correct orientation
185 );
186 TBOX deskew_block_coords( //block box
187 TO_BLOCK *block, //block to do
188 float gradient //global skew
189 );
190 void compute_line_occupation( //project blobs
191 TO_BLOCK *block, //block to do
192 float gradient, //global skew
193 inT32 min_y, //min coord in block
194 inT32 max_y, //in block
195 inT32 *occupation, //output projection
196 inT32 *deltas //derivative
197 );
198 void compute_occupation_threshold( //project blobs
199 inT32 low_window, //below result point
200 inT32 high_window, //above result point
201 inT32 line_count, //array sizes
202 inT32 *occupation, //input projection
203 inT32 *thresholds //output thresholds
204 );
205 void compute_dropout_distances( //project blobs
206 inT32 *occupation, //input projection
207 inT32 *thresholds, //output thresholds
208 inT32 line_count //array sizes
209 );
210 void expand_rows( //find lines
211 ICOORD page_tr, //top right
212 TO_BLOCK *block, //block to do
213 float gradient, //gradient to fit
214 FCOORD rotation, //for drawing
215 inT32 block_edge, //edge of block
216 BOOL8 testing_on //correct orientation
217 );
218 void adjust_row_limits( //tidy limits
219 TO_BLOCK *block //block to do
220 );
221 void compute_row_stats( //find lines
222 TO_BLOCK *block, //block to do
223 BOOL8 testing_on //correct orientation
224 );
225 void compute_block_xheight( //find lines
226 TO_BLOCK *block, //block to do
227 float gradient, //global skew
228 tesseract::Tesseract* tess
229 );
230 float median_block_xheight( //find lines
231 TO_BLOCK *block, //block to do
232 float gradient //global skew
233 );
234 void compute_row_xheight( //find lines
235 TO_ROW *row, //row to do
236 float gradient, //global skew
237 int block_height, //a guess of block xheight
238 tesseract::Tesseract* tess
239 );
240
241 int compute_xheight_from_modes(
242 STATS *heights, STATS *floating_heights, int min_height,
243 int max_height, float *xheight, float *ascrise);
244
245 inT32 compute_row_descdrop( //find lines
246 TO_ROW *row, //row to do
247 float gradient, // global skew
248 int xheight_blob_count,
249 STATS *heights
250 );
251 inT32 compute_height_modes( //find lines
252 STATS *heights, //stats to search
253 inT32 min_height, //bottom of range
254 inT32 max_height, //top of range
255 inT32 *modes, //output array
256 inT32 maxmodes //size of modes
257 );
258 void correct_row_xheight( //fix bad values
259 TO_ROW *row, //row to fix
260 float xheight, //average values
261 float ascrise,
262 float descdrop);
263 void separate_underlines( //make rough chars
264 TO_BLOCK *block, //block to do
265 float gradient, //skew angle
266 FCOORD rotation, //inverse landscape
267 BOOL8 testing_on //correct orientation
268 );
269 void pre_associate_blobs( //make rough chars
270 ICOORD page_tr, //top right
271 TO_BLOCK *block, //block to do
272 FCOORD rotation, //inverse landscape
273 BOOL8 testing_on //correct orientation
274 );
275 void fit_parallel_rows( //find lines
276 TO_BLOCK *block, //block to do
277 float gradient, //gradient to fit
278 FCOORD rotation, //for drawing
279 inT32 block_edge, //edge of block
280 BOOL8 testing_on //correct orientation
281 );
282 void fit_parallel_lms( //sort function
283 float gradient, //forced gradient
284 TO_ROW *row //row to fit
285 );
286 void make_spline_rows( //find lines
287 TO_BLOCK *block, //block to do
288 float gradient, //gradient to fit
289 FCOORD rotation, //for drawing
290 inT32 block_edge, //edge of block
291 BOOL8 testing_on, //correct orientation
292 tesseract::Tesseract* tess
293 );
294 void make_baseline_spline( //sort function
295 TO_ROW *row, //row to fit
296 TO_BLOCK *block //block it came from
297 );
298 BOOL8 segment_baseline ( //split baseline
299 TO_ROW * row, //row to fit
300 TO_BLOCK * block, //block it came from
301 inT32 & segments, //no fo segments
302 inT32 xstarts[] //coords of segments
303 );
304 double *linear_spline_baseline ( //split baseline
305 TO_ROW * row, //row to fit
306 TO_BLOCK * block, //block it came from
307 inT32 & segments, //no fo segments
308 inT32 xstarts[] //coords of segments
309 );
310 void assign_blobs_to_rows( //find lines
311 TO_BLOCK *block, //block to do
312 float *gradient, //block skew
313 int pass, //identification
314 BOOL8 reject_misses, //chuck big ones out
315 BOOL8 make_new_rows, //add rows for unmatched
316 BOOL8 drawing_skew //draw smoothed skew
317 );
318 //find best row
319 OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, //iterator
320 TO_ROW *&best_row, //output row
321 float top, //top of blob
322 float bottom, //bottom of blob
323 float rowsize, //max row size
324 BOOL8 testing_blob //test stuff
325 );
// blob_x_order -- original header tag: "sort function".
// The (const void *, const void *) -> int signature has the shape of a C
// qsort-style comparator; the ordering key is not stated in this header.
//   item1, item2  items to compare
int blob_x_order(const void *item1, const void *item2);
// row_y_order -- original header tag: "sort function".
// The (const void *, const void *) -> int signature has the shape of a C
// qsort-style comparator; the ordering key is not stated in this header.
//   item1, item2  items to compare
int row_y_order(const void *item1, const void *item2);
// row_spacing_order -- original header tag: "sort function".
// The (const void *, const void *) -> int signature has the shape of a C
// qsort-style comparator; the ordering key is not stated in this header.
//   item1, item2  items to compare
int row_spacing_order(const void *item1, const void *item2);
335
336 void mark_repeated_chars(TO_ROW *row, float block_xheight,
337 tesseract::Tesseract *tess);
338 #endif
339