• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef PUBLIC_FPDF_TEXT_H_
8 #define PUBLIC_FPDF_TEXT_H_
9 
10 // NOLINTNEXTLINE(build/include)
11 #include "fpdfview.h"
12 
13 // Exported Functions
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
18 // Function: FPDFText_LoadPage
19 //          Prepare information about all characters in a page.
20 // Parameters:
21 //          page    -   Handle to the page, as returned by the FPDF_LoadPage
22 //                      function (in the FPDFVIEW module).
23 // Return value:
24 //          A handle to the text page information structure, or NULL if
25 //          something goes wrong.
26 // Comments:
27 //          The application must call FPDFText_ClosePage to release the
28 //          returned text page information.
29 //
30 DLLEXPORT FPDF_TEXTPAGE STDCALL FPDFText_LoadPage(FPDF_PAGE page);
31 
32 // Function: FPDFText_ClosePage
33 //          Release all resources allocated for a text page information
34 //          structure.
35 // Parameters:
36 //          text_page   -   Handle to a text page information structure.
37 //                          Returned by FPDFText_LoadPage function.
38 // Return value:
39 //          None.
40 //
41 DLLEXPORT void STDCALL FPDFText_ClosePage(FPDF_TEXTPAGE text_page);
42 
43 // Function: FPDFText_CountChars
44 //          Get the number of characters in a page.
45 // Parameters:
46 //          text_page   -   Handle to a text page information structure.
47 //                          Returned by FPDFText_LoadPage function.
48 // Return value:
49 //          Number of characters in the page, or -1 on error.
50 //          Generated characters, such as additional space characters and
51 //          new line characters, are also counted.
52 // Comments:
53 //          Characters in a page form a "stream"; inside the stream, each
54 //          character has an index.
55 //          Many FPDFTEXT functions take such an index as a parameter.
56 //          Indices are zero-based: the first character in the page has an
57 //          index value of zero.
58 //
59 DLLEXPORT int STDCALL FPDFText_CountChars(FPDF_TEXTPAGE text_page);
60 
61 // Function: FPDFText_GetUnicode
62 //          Get the Unicode value of a character in a page.
63 // Parameters:
64 //          text_page   -   Handle to a text page information structure.
65 //                          Returned by FPDFText_LoadPage function.
66 //          index       -   Zero-based index of the character.
67 // Return value:
68 //          The Unicode value of the specified character.
69 //          If the character is not encoded in Unicode and the Foxit
70 //          engine cannot convert it to Unicode, the return value will be
71 //          zero.
72 //
73 DLLEXPORT unsigned int STDCALL FPDFText_GetUnicode(FPDF_TEXTPAGE text_page,
74                                                    int index);
75 
76 // Function: FPDFText_GetFontSize
77 //          Get the font size of a particular character.
78 // Parameters:
79 //          text_page   -   Handle to a text page information structure.
80 //                          Returned by FPDFText_LoadPage function.
81 //          index       -   Zero-based index of the character.
82 // Return value:
83 //          The font size of the particular character, measured in points
84 //          (about 1/72 inch).
85 //          This is the typographic size of the font (the so-called "em size").
86 //
87 DLLEXPORT double STDCALL FPDFText_GetFontSize(FPDF_TEXTPAGE text_page,
88                                               int index);
89 
90 // Function: FPDFText_GetCharBox
91 //          Get the bounding box of a particular character.
92 // Parameters:
93 //          text_page   -   Handle to a text page information structure.
94 //                          Returned by FPDFText_LoadPage function.
95 //          index       -   Zero-based index of the character.
96 //          left        -   Pointer to a double receiving the left position
97 //                          of the character box.
98 //          right       -   Pointer to a double receiving the right position
99 //                          of the character box.
100 //          bottom      -   Pointer to a double receiving the bottom position
101 //                          of the character box.
102 //          top         -   Pointer to a double receiving the top position of
103 //                          the character box.
104 // Return value:
105 //          None.
106 // Comments:
107 //          All positions are measured in PDF "user space". Note the output
108 //          order (left, right, bottom, top) differs from FPDFText_GetRect.
109 DLLEXPORT void STDCALL FPDFText_GetCharBox(FPDF_TEXTPAGE text_page,
110                                            int index,
111                                            double* left,
112                                            double* right,
113                                            double* bottom,
114                                            double* top);
115 
116 // Function: FPDFText_GetCharIndexAtPos
117 //          Get the index of a character at or nearby a certain position on the
118 //          page.
119 // Parameters:
120 //          text_page   -   Handle to a text page information structure.
121 //                          Returned by FPDFText_LoadPage function.
122 //          x           -   X position in PDF "user space".
123 //          y           -   Y position in PDF "user space".
124 //          xTolerance  -   An x-axis tolerance value for character hit
125 //                          detection, in points.
126 //          yTolerance  -   A y-axis tolerance value for character hit
127 //                          detection, in points.
128 // Return value:
129 //          The zero-based index of the character at, or nearby the point (x,y).
130 //          If there is no character at or nearby the point, the return value
131 //          will be -1.
132 //          If an error occurs, -3 will be returned.
133 //
134 DLLEXPORT int STDCALL FPDFText_GetCharIndexAtPos(FPDF_TEXTPAGE text_page,
135                                                  double x,
136                                                  double y,
137                                                  double xTolerance,
138                                                  double yTolerance);
139 
140 // Function: FPDFText_GetText
141 //          Extract a unicode text string from the page.
142 // Parameters:
143 //          text_page   -   Handle to a text page information structure.
144 //                          Returned by FPDFText_LoadPage function.
145 //          start_index -   Index of the first character to extract.
146 //          count       -   Number of characters to be extracted.
147 //          result      -   A buffer (allocated by the application) receiving
148 //                          the extracted unicode characters.
149 //                          The size of the buffer must be able to hold the
150 //                          number of characters plus a terminator.
151 // Return value:
152 //          Number of characters written into the result buffer, including the
153 //          trailing terminator.
154 // Comments:
155 //          This function ignores characters without unicode information.
156 //
157 DLLEXPORT int STDCALL FPDFText_GetText(FPDF_TEXTPAGE text_page,
158                                        int start_index,
159                                        int count,
160                                        unsigned short* result);
161 
162 // Function: FPDFText_CountRects
163 //          Count the number of rectangular areas occupied by a text segment.
164 // Parameters:
165 //          text_page   -   Handle to a text page information structure.
166 //                          Returned by FPDFText_LoadPage function.
167 //          start_index -   Index of the first character of the segment.
168 //          count       -   Number of characters in the segment.
169 // Return value:
170 //          Number of rectangles. Zero for error.
171 // Comments:
172 //          This function, along with FPDFText_GetRect, can be used by
173 //          applications to detect the position on the page of a text
174 //          segment, so that the proper areas can be highlighted, for
175 //          example.
176 //          FPDFTEXT will automatically merge small character boxes into a
177 //          bigger one if those characters are on the same line and use the
178 //          same font settings.
179 //
180 DLLEXPORT int STDCALL FPDFText_CountRects(FPDF_TEXTPAGE text_page,
181                                           int start_index,
182                                           int count);
183 
184 // Function: FPDFText_GetRect
185 //          Get a rectangular area from the result generated by
186 //          FPDFText_CountRects.
187 // Parameters:
188 //          text_page   -   Handle to a text page information structure.
189 //                          Returned by FPDFText_LoadPage function.
190 //          rect_index  -   Zero-based index for the rectangle.
191 //          left        -   Pointer to a double value receiving the rectangle
192 //                          left boundary.
193 //          top         -   Pointer to a double value receiving the rectangle
194 //                          top boundary.
195 //          right       -   Pointer to a double value receiving the rectangle
196 //                          right boundary.
197 //          bottom      -   Pointer to a double value receiving the rectangle
198 //                          bottom boundary.
199 // Return value:
200 //          None.
201 //
202 DLLEXPORT void STDCALL FPDFText_GetRect(FPDF_TEXTPAGE text_page,
203                                         int rect_index,
204                                         double* left,
205                                         double* top,
206                                         double* right,
207                                         double* bottom);
208 
209 // Function: FPDFText_GetBoundedText
210 //          Extract unicode text within a rectangular boundary on the page.
211 // Parameters:
212 //          text_page   -   Handle to a text page information structure.
213 //                          Returned by FPDFText_LoadPage function.
214 //          left        -   Left boundary.
215 //          top         -   Top boundary.
216 //          right       -   Right boundary.
217 //          bottom      -   Bottom boundary.
218 //          buffer      -   A unicode buffer.
219 //          buflen      -   Number of characters (not bytes) for the buffer,
220 //                          excluding an additional terminator.
221 // Return value:
222 //          If |buffer| is NULL or |buflen| is zero, return the number of
223 //          characters (not bytes) of text present within the rectangle,
224 //          excluding a terminating NUL. Generally you should pass a
225 //          buffer at least one character larger than this if you want a
226 //          terminating NUL, which will be provided if space is
227 //          available.
228 //          Otherwise, return the number of characters copied into the
229 //          buffer, including the terminating NUL when space for it is
230 //          available.
231 // Comments:
232 //          If the buffer is too small, as much text as will fit is copied into
233 //          it.
234 //
235 DLLEXPORT int STDCALL FPDFText_GetBoundedText(FPDF_TEXTPAGE text_page,
236                                               double left,
237                                               double top,
238                                               double right,
239                                               double bottom,
240                                               unsigned short* buffer,
241                                               int buflen);
242 
243 // Bit flags accepted by the |flags| argument of FPDFText_FindStart.
244 #define FPDF_MATCHCASE \
245   0x00000001  // Match case. If not set, the search does not match case.
246 #define FPDF_MATCHWHOLEWORD \
247   0x00000002  // Match whole words. If not set, whole-word matching is off.
248 
249 // Function: FPDFText_FindStart
250 //          Start a search.
251 // Parameters:
252 //          text_page   -   Handle to a text page information structure.
253 //                          Returned by FPDFText_LoadPage function.
254 //          findwhat    -   A unicode match pattern.
255 //          flags       -   Option flags (FPDF_MATCHCASE, FPDF_MATCHWHOLEWORD).
256 //          start_index -   Start from this character. -1 for end of the page.
257 // Return value:
258 //          A handle for the search context. FPDFText_FindClose must be called
259 //          to release this handle.
260 //
261 DLLEXPORT FPDF_SCHHANDLE STDCALL FPDFText_FindStart(FPDF_TEXTPAGE text_page,
262                                                     FPDF_WIDESTRING findwhat,
263                                                     unsigned long flags,
264                                                     int start_index);
265 
266 // Function: FPDFText_FindNext
267 //          Search for the next match, in the direction from page start to end.
268 // Parameters:
269 //          handle      -   A search context handle returned by
270 //                          FPDFText_FindStart.
271 // Return value:
272 //          Whether a match is found.
273 //
274 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindNext(FPDF_SCHHANDLE handle);
275 
276 // Function: FPDFText_FindPrev
277 //          Search for the next match, in the direction from page end to start.
278 // Parameters:
279 //          handle      -   A search context handle returned by
280 //                          FPDFText_FindStart.
281 // Return value:
282 //          Whether a match is found.
283 //
284 DLLEXPORT FPDF_BOOL STDCALL FPDFText_FindPrev(FPDF_SCHHANDLE handle);
285 
286 // Function: FPDFText_GetSchResultIndex
287 //          Get the starting character index of the search result.
288 // Parameters:
289 //          handle      -   A search context handle returned by
290 //                          FPDFText_FindStart.
291 // Return value:
292 //          Index of the starting character of the search result.
293 //
294 DLLEXPORT int STDCALL FPDFText_GetSchResultIndex(FPDF_SCHHANDLE handle);
295 
296 // Function: FPDFText_GetSchCount
297 //          Get the number of matched characters in the search result.
298 // Parameters:
299 //          handle      -   A search context handle returned by
300 //                          FPDFText_FindStart.
301 // Return value:
302 //          Number of matched characters.
303 //
304 DLLEXPORT int STDCALL FPDFText_GetSchCount(FPDF_SCHHANDLE handle);
305 
306 // Function: FPDFText_FindClose
307 //          Release a search context.
308 // Parameters:
309 //          handle      -   A search context handle returned by
310 //                          FPDFText_FindStart.
311 // Return value:
312 //          None.
313 //
314 DLLEXPORT void STDCALL FPDFText_FindClose(FPDF_SCHHANDLE handle);
315 
316 // Function: FPDFLink_LoadWebLinks
317 //          Prepare information about weblinks in a page.
318 // Parameters:
319 //          text_page   -   Handle to a text page information structure.
320 //                          Returned by FPDFText_LoadPage function.
321 // Return value:
322 //          A handle to the page's links information structure.
323 //          NULL if something goes wrong.
324 // Comments:
325 //          Weblinks are those links implicitly embedded in PDF pages. PDF also
326 //          has a type of annotation called "link"; FPDFTEXT doesn't deal with
327 //          that kind of link.
328 //
329 //          The FPDFTEXT weblink feature is useful for automatically detecting
330 //          links in the page contents.
331 //
332 //          For example, things like "http://www.foxitsoftware.com" will be
333 //          detected, so applications can allow the user to click on those
334 //          characters to activate the link, even if the PDF doesn't come with
335 //          link annotations.
336 //
337 //          FPDFLink_CloseWebLinks must be called to release resources.
338 //
339 DLLEXPORT FPDF_PAGELINK STDCALL FPDFLink_LoadWebLinks(FPDF_TEXTPAGE text_page);
340 
341 // Function: FPDFLink_CountWebLinks
342 //          Count the number of detected web links.
343 // Parameters:
344 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
345 // Return value:
346 //          Number of detected web links.
347 //
348 DLLEXPORT int STDCALL FPDFLink_CountWebLinks(FPDF_PAGELINK link_page);
349 
350 // Function: FPDFLink_GetURL
351 //          Fetch the URL information for a detected web link.
352 // Parameters:
353 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
354 //          link_index  -   Zero-based index for the link.
355 //          buffer      -   A unicode buffer for the result.
356 //          buflen      -   Number of characters (not bytes) for the buffer,
357 //                          including an additional terminator.
358 // Return value:
359 //          If |buffer| is NULL or |buflen| is zero, return the number of
360 //          characters (not bytes) needed to buffer the result (an additional
361 //          terminator is included in this count).
362 //          Otherwise, copy the result into |buffer|, truncating at |buflen| if
363 //          the result is too large to fit, and return the number of characters
364 //          actually copied into the buffer (the additional terminator is also
365 //          included in this count).
366 //          If |link_index| does not correspond to a valid link, then the result
367 //          is an empty string.
368 //
369 DLLEXPORT int STDCALL FPDFLink_GetURL(FPDF_PAGELINK link_page,
370                                       int link_index,
371                                       unsigned short* buffer,
372                                       int buflen);
373 
374 // Function: FPDFLink_CountRects
375 //          Count the number of rectangular areas for the link.
376 // Parameters:
377 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
378 //          link_index  -   Zero-based index for the link.
379 // Return value:
380 //          Number of rectangular areas for the link.  If |link_index| does
381 //          not correspond to a valid link, then 0 is returned.
382 //
383 DLLEXPORT int STDCALL FPDFLink_CountRects(FPDF_PAGELINK link_page,
384                                           int link_index);
385 
386 // Function: FPDFLink_GetRect
387 //          Fetch the boundaries of one rectangle of a detected web link.
388 // Parameters:
389 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
390 //          link_index  -   Zero-based index for the link.
391 //          rect_index  -   Zero-based index for a rectangle.
392 //          left        -   Pointer to a double value receiving the rectangle
393 //                          left boundary.
394 //          top         -   Pointer to a double value receiving the rectangle
395 //                          top boundary.
396 //          right       -   Pointer to a double value receiving the rectangle
397 //                          right boundary.
398 //          bottom      -   Pointer to a double value receiving the rectangle
399 //                          bottom boundary.
400 // Return value:
401 //          None.  If |link_index| does not correspond to a valid link, then
402 //          |left|, |top|, |right|, and |bottom| remain unmodified.
403 //
404 DLLEXPORT void STDCALL FPDFLink_GetRect(FPDF_PAGELINK link_page,
405                                         int link_index,
406                                         int rect_index,
407                                         double* left,
408                                         double* top,
409                                         double* right,
410                                         double* bottom);
411 
412 // Function: FPDFLink_CloseWebLinks
413 //          Release resources used by the weblink feature.
414 // Parameters:
415 //          link_page   -   Handle returned by FPDFLink_LoadWebLinks.
416 // Return value:
417 //          None.
418 //
419 DLLEXPORT void STDCALL FPDFLink_CloseWebLinks(FPDF_PAGELINK link_page);
420 
421 #ifdef __cplusplus
422 }
423 #endif
424 
425 #endif  // PUBLIC_FPDF_TEXT_H_
426