1 // Copyright 2014 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef PUBLIC_FPDF_TEXT_H_ 8 #define PUBLIC_FPDF_TEXT_H_ 9 10 // NOLINTNEXTLINE(build/include) 11 #include "fpdfview.h" 12 13 // Exported Functions 14 #ifdef __cplusplus 15 extern "C" { 16 #endif 17 18 // Function: FPDFText_LoadPage 19 // Prepare information about all characters in a page. 20 // Parameters: 21 // page - Handle to the page. Returned by FPDF_LoadPage function 22 // (in FPDFVIEW module). 23 // Return value: 24 // A handle to the text page information structure. 25 // NULL if something goes wrong. 26 // Comments: 27 // Application must call FPDFText_ClosePage to release the text page 28 // information. 29 // 30 FPDF_EXPORT FPDF_TEXTPAGE FPDF_CALLCONV FPDFText_LoadPage(FPDF_PAGE page); 31 32 // Function: FPDFText_ClosePage 33 // Release all resources allocated for a text page information 34 // structure. 35 // Parameters: 36 // text_page - Handle to a text page information structure. 37 // Returned by FPDFText_LoadPage function. 38 // Return Value: 39 // None. 40 // 41 FPDF_EXPORT void FPDF_CALLCONV FPDFText_ClosePage(FPDF_TEXTPAGE text_page); 42 43 // Function: FPDFText_CountChars 44 // Get number of characters in a page. 45 // Parameters: 46 // text_page - Handle to a text page information structure. 47 // Returned by FPDFText_LoadPage function. 48 // Return value: 49 // Number of characters in the page. Return -1 for error. 50 // Generated characters, like additional space characters, new line 51 // characters, are also counted. 52 // Comments: 53 // Characters in a page form a "stream", inside the stream, each 54 // character has an index. 55 // We will use the index parameters in many of FPDFTEXT functions. The 56 // first character in the page 57 // has an index value of zero. 58 // 59 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountChars(FPDF_TEXTPAGE text_page); 60 61 // Function: FPDFText_GetUnicode 62 // Get Unicode of a character in a page. 63 // Parameters: 64 // text_page - Handle to a text page information structure. 65 // Returned by FPDFText_LoadPage function. 66 // index - Zero-based index of the character. 67 // Return value: 68 // The Unicode of the particular character. 69 // If a character is not encoded in Unicode and Foxit engine can't 70 // convert to Unicode, 71 // the return value will be zero. 72 // 73 FPDF_EXPORT unsigned int FPDF_CALLCONV 74 FPDFText_GetUnicode(FPDF_TEXTPAGE text_page, int index); 75 76 // Function: FPDFText_GetFontSize 77 // Get the font size of a particular character. 78 // Parameters: 79 // text_page - Handle to a text page information structure. 80 // Returned by FPDFText_LoadPage function. 81 // index - Zero-based index of the character. 82 // Return value: 83 // The font size of the particular character, measured in points (about 84 // 1/72 inch). 85 // This is the typographic size of the font (so called "em size"). 86 // 87 FPDF_EXPORT double FPDF_CALLCONV FPDFText_GetFontSize(FPDF_TEXTPAGE text_page, 88 int index); 89 90 // Function: FPDFText_GetCharBox 91 // Get bounding box of a particular character. 92 // Parameters: 93 // text_page - Handle to a text page information structure. 94 // Returned by FPDFText_LoadPage function. 95 // index - Zero-based index of the character. 96 // left - Pointer to a double number receiving left position 97 // of the character box. 98 // right - Pointer to a double number receiving right position 99 // of the character box. 100 // bottom - Pointer to a double number receiving bottom position 101 // of the character box. 102 // top - Pointer to a double number receiving top position of 103 // the character box. 104 // Return Value: 105 // On success, return TRUE and fill in |left|, |right|, |bottom|, and 106 // |top|. If |text_page| is invalid, or if |index| is out of bounds, 107 // then return FALSE, and the out parameters remain unmodified. 108 // Comments: 109 // All positions are measured in PDF "user space". 110 // 111 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetCharBox(FPDF_TEXTPAGE text_page, 112 int index, 113 double* left, 114 double* right, 115 double* bottom, 116 double* top); 117 118 // Function: FPDFText_GetCharOrigin 119 // Get origin of a particular character. 120 // Parameters: 121 // text_page - Handle to a text page information structure. 122 // Returned by FPDFText_LoadPage function. 123 // index - Zero-based index of the character. 124 // x - Pointer to a double number receiving x coordinate of 125 // the character origin. 126 // y - Pointer to a double number receiving y coordinate of 127 // the character origin. 128 // Return Value: 129 // Whether the call succeeded. If false, x and y are unchanged. 130 // Comments: 131 // All positions are measured in PDF "user space". 132 // 133 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV 134 FPDFText_GetCharOrigin(FPDF_TEXTPAGE text_page, 135 int index, 136 double* x, 137 double* y); 138 139 // Function: FPDFText_GetCharIndexAtPos 140 // Get the index of a character at or nearby a certain position on the 141 // page. 142 // Parameters: 143 // text_page - Handle to a text page information structure. 144 // Returned by FPDFText_LoadPage function. 145 // x - X position in PDF "user space". 146 // y - Y position in PDF "user space". 147 // xTolerance - An x-axis tolerance value for character hit 148 // detection, in point unit. 149 // yTolerance - A y-axis tolerance value for character hit 150 // detection, in point unit. 151 // Return Value: 152 // The zero-based index of the character at, or nearby the point (x,y). 153 // If there is no character at or nearby the point, return value will 154 // be -1. 155 // If an error occurs, -3 will be returned. 156 // 157 FPDF_EXPORT int FPDF_CALLCONV 158 FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page, 159 double x, 160 double y, 161 double xTolerance, 162 double yTolerance); 163 164 // Function: FPDFText_GetText 165 // Extract unicode text string from the page. 166 // Parameters: 167 // text_page - Handle to a text page information structure. 168 // Returned by FPDFText_LoadPage function. 169 // start_index - Index for the start characters. 170 // count - Number of characters to be extracted. 171 // result - A buffer (allocated by application) receiving the 172 // extracted unicodes. 173 // The size of the buffer must be able to hold the 174 // number of characters plus a terminator. 175 // Return Value: 176 // Number of characters written into the result buffer, including the 177 // trailing terminator. 178 // Comments: 179 // This function ignores characters without unicode information. 180 // 181 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetText(FPDF_TEXTPAGE text_page, 182 int start_index, 183 int count, 184 unsigned short* result); 185 186 // Function: FPDFText_CountRects 187 // Count number of rectangular areas occupied by a segment of texts. 188 // Parameters: 189 // text_page - Handle to a text page information structure. 190 // Returned by FPDFText_LoadPage function. 191 // start_index - Index for the start characters. 192 // count - Number of characters. 193 // Return value: 194 // Number of rectangles. Zero for error. 195 // Comments: 196 // This function, along with FPDFText_GetRect can be used by 197 // applications to detect the position 198 // on the page for a text segment, so proper areas can be highlighted 199 // or something. 200 // FPDFTEXT will automatically merge small character boxes into bigger 201 // one if those characters 202 // are on the same line and use same font settings. 203 // 204 FPDF_EXPORT int FPDF_CALLCONV FPDFText_CountRects(FPDF_TEXTPAGE text_page, 205 int start_index, 206 int count); 207 208 // Function: FPDFText_GetRect 209 // Get a rectangular area from the result generated by 210 // FPDFText_CountRects. 211 // Parameters: 212 // text_page - Handle to a text page information structure. 213 // Returned by FPDFText_LoadPage function. 214 // rect_index - Zero-based index for the rectangle. 215 // left - Pointer to a double value receiving the rectangle 216 // left boundary. 217 // top - Pointer to a double value receiving the rectangle 218 // top boundary. 219 // right - Pointer to a double value receiving the rectangle 220 // right boundary. 221 // bottom - Pointer to a double value receiving the rectangle 222 // bottom boundary. 223 // Return Value: 224 // On success, return TRUE and fill in |left|, |top|, |right|, and 225 // |bottom|. If |link_page| is invalid then return FALSE, and the out 226 // parameters remain unmodified. If |link_page| is valid but 227 // |link_index| is out of bounds, then return FALSE and set the out 228 // parameters to 0. 229 // 230 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_GetRect(FPDF_TEXTPAGE text_page, 231 int rect_index, 232 double* left, 233 double* top, 234 double* right, 235 double* bottom); 236 237 // Function: FPDFText_GetBoundedText 238 // Extract unicode text within a rectangular boundary on the page. 239 // Parameters: 240 // text_page - Handle to a text page information structure. 241 // Returned by FPDFText_LoadPage function. 242 // left - Left boundary. 243 // top - Top boundary. 244 // right - Right boundary. 245 // bottom - Bottom boundary. 246 // buffer - A unicode buffer. 247 // buflen - Number of characters (not bytes) for the buffer, 248 // excluding an additional terminator. 249 // Return Value: 250 // If buffer is NULL or buflen is zero, return number of characters 251 // (not bytes) of text present within 252 // the rectangle, excluding a terminating NUL. Generally you should 253 // pass a buffer at least one larger 254 // than this if you want a terminating NUL, which will be provided if 255 // space is available. 256 // Otherwise, return number of characters copied into the buffer, 257 // including the terminating NUL 258 // when space for it is available. 259 // Comment: 260 // If the buffer is too small, as much text as will fit is copied into 261 // it. 262 // 263 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page, 264 double left, 265 double top, 266 double right, 267 double bottom, 268 unsigned short* buffer, 269 int buflen); 270 271 // Flags used by FPDFText_FindStart function. 272 #define FPDF_MATCHCASE \ 273 0x00000001 // If not set, it will not match case by default. 274 #define FPDF_MATCHWHOLEWORD \ 275 0x00000002 // If not set, it will not match the whole word by default. 276 277 // Function: FPDFText_FindStart 278 // Start a search. 279 // Parameters: 280 // text_page - Handle to a text page information structure. 281 // Returned by FPDFText_LoadPage function. 282 // findwhat - A unicode match pattern. 283 // flags - Option flags. 284 // start_index - Start from this character. -1 for end of the page. 285 // Return Value: 286 // A handle for the search context. FPDFText_FindClose must be called 287 // to release this handle. 288 // 289 FPDF_EXPORT FPDF_SCHHANDLE FPDF_CALLCONV 290 FPDFText_FindStart(FPDF_TEXTPAGE text_page, 291 FPDF_WIDESTRING findwhat, 292 unsigned long flags, 293 int start_index); 294 295 // Function: FPDFText_FindNext 296 // Search in the direction from page start to end. 297 // Parameters: 298 // handle - A search context handle returned by 299 // FPDFText_FindStart. 300 // Return Value: 301 // Whether a match is found. 302 // 303 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindNext(FPDF_SCHHANDLE handle); 304 305 // Function: FPDFText_FindPrev 306 // Search in the direction from page end to start. 307 // Parameters: 308 // handle - A search context handle returned by 309 // FPDFText_FindStart. 310 // Return Value: 311 // Whether a match is found. 312 // 313 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFText_FindPrev(FPDF_SCHHANDLE handle); 314 315 // Function: FPDFText_GetSchResultIndex 316 // Get the starting character index of the search result. 317 // Parameters: 318 // handle - A search context handle returned by 319 // FPDFText_FindStart. 320 // Return Value: 321 // Index for the starting character. 322 // 323 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle); 324 325 // Function: FPDFText_GetSchCount 326 // Get the number of matched characters in the search result. 327 // Parameters: 328 // handle - A search context handle returned by 329 // FPDFText_FindStart. 330 // Return Value: 331 // Number of matched characters. 332 // 333 FPDF_EXPORT int FPDF_CALLCONV FPDFText_GetSchCount(FPDF_SCHHANDLE handle); 334 335 // Function: FPDFText_FindClose 336 // Release a search context. 337 // Parameters: 338 // handle - A search context handle returned by 339 // FPDFText_FindStart. 340 // Return Value: 341 // None. 342 // 343 FPDF_EXPORT void FPDF_CALLCONV FPDFText_FindClose(FPDF_SCHHANDLE handle); 344 345 // Function: FPDFLink_LoadWebLinks 346 // Prepare information about weblinks in a page. 347 // Parameters: 348 // text_page - Handle to a text page information structure. 349 // Returned by FPDFText_LoadPage function. 350 // Return Value: 351 // A handle to the page's links information structure. 352 // NULL if something goes wrong. 353 // Comments: 354 // Weblinks are those links implicitly embedded in PDF pages. PDF also 355 // has a type of 356 // annotation called "link", FPDFTEXT doesn't deal with that kind of 357 // link. 358 // FPDFTEXT weblink feature is useful for automatically detecting links 359 // in the page 360 // contents. For example, things like "http://www.foxitsoftware.com" 361 // will be detected, 362 // so applications can allow user to click on those characters to 363 // activate the link, 364 // even the PDF doesn't come with link annotations. 365 // 366 // FPDFLink_CloseWebLinks must be called to release resources. 367 // 368 FPDF_EXPORT FPDF_PAGELINK FPDF_CALLCONV 369 FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page); 370 371 // Function: FPDFLink_CountWebLinks 372 // Count number of detected web links. 373 // Parameters: 374 // link_page - Handle returned by FPDFLink_LoadWebLinks. 375 // Return Value: 376 // Number of detected web links. 377 // 378 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountWebLinks(FPDF_PAGELINK link_page); 379 380 // Function: FPDFLink_GetURL 381 // Fetch the URL information for a detected web link. 382 // Parameters: 383 // link_page - Handle returned by FPDFLink_LoadWebLinks. 384 // link_index - Zero-based index for the link. 385 // buffer - A unicode buffer for the result. 386 // buflen - Number of characters (not bytes) for the buffer, 387 // including an additional terminator. 388 // Return Value: 389 // If |buffer| is NULL or |buflen| is zero, return the number of 390 // characters (not bytes) needed to buffer the result (an additional 391 // terminator is included in this count). 392 // Otherwise, copy the result into |buffer|, truncating at |buflen| if 393 // the result is too large to fit, and return the number of characters 394 // actually copied into the buffer (the additional terminator is also 395 // included in this count). 396 // If |link_index| does not correspond to a valid link, then the result 397 // is an empty string. 398 // 399 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_GetURL(FPDF_PAGELINK link_page, 400 int link_index, 401 unsigned short* buffer, 402 int buflen); 403 404 // Function: FPDFLink_CountRects 405 // Count number of rectangular areas for the link. 406 // Parameters: 407 // link_page - Handle returned by FPDFLink_LoadWebLinks. 408 // link_index - Zero-based index for the link. 409 // Return Value: 410 // Number of rectangular areas for the link. If |link_index| does 411 // not correspond to a valid link, then 0 is returned. 412 // 413 FPDF_EXPORT int FPDF_CALLCONV FPDFLink_CountRects(FPDF_PAGELINK link_page, 414 int link_index); 415 416 // Function: FPDFLink_GetRect 417 // Fetch the boundaries of a rectangle for a link. 418 // Parameters: 419 // link_page - Handle returned by FPDFLink_LoadWebLinks. 420 // link_index - Zero-based index for the link. 421 // rect_index - Zero-based index for a rectangle. 422 // left - Pointer to a double value receiving the rectangle 423 // left boundary. 424 // top - Pointer to a double value receiving the rectangle 425 // top boundary. 426 // right - Pointer to a double value receiving the rectangle 427 // right boundary. 428 // bottom - Pointer to a double value receiving the rectangle 429 // bottom boundary. 430 // Return Value: 431 // On success, return TRUE and fill in |left|, |top|, |right|, and 432 // |bottom|. If |link_page| is invalid or if |link_index| does not 433 // correspond to a valid link, then return FALSE, and the out 434 // parameters remain unmodified. 435 // 436 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV FPDFLink_GetRect(FPDF_PAGELINK link_page, 437 int link_index, 438 int rect_index, 439 double* left, 440 double* top, 441 double* right, 442 double* bottom); 443 444 // Function: FPDFLink_CloseWebLinks 445 // Release resources used by weblink feature. 446 // Parameters: 447 // link_page - Handle returned by FPDFLink_LoadWebLinks. 448 // Return Value: 449 // None. 450 // 451 FPDF_EXPORT void FPDF_CALLCONV FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page); 452 453 #ifdef __cplusplus 454 } 455 #endif 456 457 #endif // PUBLIC_FPDF_TEXT_H_ 458