• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -  This software is distributed in the hope that it will be
4  -  useful, but with NO WARRANTY OF ANY KIND.
5  -  No author or distributor accepts responsibility to anyone for the
6  -  consequences of using this software, or for whether it serves any
7  -  particular purpose or works at all, unless he or she says so in
8  -  writing.  Everyone is granted permission to copy, modify and
9  -  redistribute this source code, for commercial or non-commercial
10  -  purposes, with the following restrictions: (1) the origin of this
11  -  source code must not be misrepresented; (2) modified versions must
12  -  be plainly marked as such; and (3) this notice may not be removed
13  -  or altered from any source or modified source distribution.
14  *====================================================================*/
15 
16 
17 /*
18  *   sarray.c
19  *
20  *      Create/Destroy/Copy
21  *          SARRAY    *sarrayCreate()
22  *          SARRAY    *sarrayCreateWordsFromString()
23  *          SARRAY    *sarrayCreateLinesFromString()
24  *          void       sarrayDestroy()
25  *          SARRAY    *sarrayCopy()
26  *          SARRAY    *sarrayClone()
27  *
28  *      Add/Remove string
29  *          l_int32    sarrayAddString()
30  *          l_int32    sarrayExtendArray()
31  *          char      *sarrayRemoveString()
32  *          l_int32    sarrayClear()
33  *
34  *      Accessors
35  *          l_int32    sarrayGetCount()
36  *          char     **sarrayGetArray()
37  *          char      *sarrayGetString()
38  *          l_int32    sarrayGetRefcount()
39  *          l_int32    sarrayChangeRefcount()
40  *
41  *      Conversion back to string
42  *          char      *sarrayToString()
43  *          char      *sarrayToStringRange()
44  *
45  *      Concatenate 2 sarrays
46  *          l_int32    sarrayConcatenate()
47  *          l_int32    sarrayAppendRange()
48  *
49  *      Convert word sarray to (formatted) line sarray
50  *          SARRAY    *sarrayConvertWordsToLines()
51  *
52  *      Split string on separator list
53  *          SARRAY    *sarraySplitString()
54  *
55  *      Filter sarray
56  *          SARRAY    *sarraySelectBySubstring()
57  *          l_int32    sarrayParseRange()
58  *
59  *      Sort
60  *          SARRAY    *sarraySort()
61  *          l_int32    stringCompareLexical()
62  *
63  *      Serialize for I/O
64  *          SARRAY    *sarrayRead()
65  *          SARRAY    *sarrayReadStream()
66  *          l_int32    sarrayWrite()
67  *          l_int32    sarrayWriteStream()
68  *          l_int32    sarrayAppend()
69  *
70  *      Directory filenames
71  *          SARRAY    *getSortedPathnamesInDirectory()
72  *          SARRAY    *getFilenamesInDirectory()
73  *
74  *
75  *      Comments on usage:
76  *
77  *          These functions are important for efficient manipulation
78  *          of string data.  They have been used in leptonica for
79  *          generating and parsing text files, and for generating
80  *          code for compilation.  The user is responsible for
81  *          correctly disposing of strings that have been extracted
82  *          from sarrays.
83  *
84  *            - When you want a string from an Sarray to inspect it, or
85  *              plan to make a copy of it later, use sarrayGetString()
86  *              with copyflag = 0.  In this case, you must neither free
87  *              the string nor put it directly in another array.
88  *              We provide the copyflag constant L_NOCOPY, which is 0,
89  *              for this purpose:
90  *                 str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
91  *              To extract a copy of a string, use:
92  *                 str-owned = sarrayGetString(sa, index, L_COPY);
93  *
94  *            - When you want to insert a string that is in one
95  *              array into another array (always leaving the first
96  *              array intact), you have two options:
97  *                 (1) use copyflag = L_COPY to make an immediate copy,
98  *                     which you must then add to the second array
99  *                     by insertion; namely,
100  *                       str-owned = sarrayGetString(sa, index, L_COPY);
101  *                       sarrayAddString(sa2, str-owned, L_INSERT);
102  *                 (2) use copyflag = L_NOCOPY to get another handle to
103  *                     the string, in which case you must add
104  *                     a copy of it to the second string array:
105  *                       str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
106  *                       sarrayAddString(sa2, str-not-owned, L_COPY).
107  *
108  *              In all cases, when you use copyflag = L_COPY to extract
109  *              a string from an array, you must either free it
110  *              or insert it in an array that will be freed later.
111  */
112 
113 #include <stdio.h>
114 #include <string.h>
115 #include <stdlib.h>
116 #ifndef COMPILER_MSVC
117 #include <dirent.h>     /* unix only */
118 #endif  /* !COMPILER_MSVC */
119 #include "allheaders.h"
120 
121 static const l_int32  INITIAL_PTR_ARRAYSIZE = 50;     /* arbitrary default; sarrayCreate() uses it when n <= 0 */
122 static const l_int32  L_BUF_SIZE = 512;     /* NOTE(review): not referenced in the visible part of this file */
123 
124 
125 /*--------------------------------------------------------------------------*
126  *                   String array create/destroy/copy/extend                *
127  *--------------------------------------------------------------------------*/
128 /*!
129  *  sarrayCreate()
130  *
131  *      Input:  size of string ptr array to be alloc'd
132  *              (use 0 for default)
133  *      Return: sarray, or null on error
134  */
135 SARRAY *
sarrayCreate(l_int32 n)136 sarrayCreate(l_int32  n)
137 {
138 SARRAY  *sa;
139 
140     PROCNAME("sarrayCreate");
141 
142     if (n <= 0)
143         n = INITIAL_PTR_ARRAYSIZE;
144 
145     if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL)
146         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
147     if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL)
148         return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
149 
150     sa->nalloc = n;
151     sa->n = 0;
152     sa->refcount = 1;
153     return sa;
154 }
155 
156 
157 /*!
158  *  sarrayCreateWordsFromString()
159  *
160  *      Input:  string
161  *      Return: sarray, or null on error
162  *
163  *  Notes:
164  *      (1) This finds the number of word substrings, creates an sarray
165  *          of this size, and puts copies of each substring into the sarray.
166  */
167 SARRAY *
sarrayCreateWordsFromString(const char * string)168 sarrayCreateWordsFromString(const char  *string)
169 {
170 char     separators[] = " \n\t";
171 l_int32  i, nsub, size, inword;
172 SARRAY  *sa;
173 
174     PROCNAME("sarrayCreateWordsFromString");
175 
176     if (!string)
177         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
178 
179         /* Find the number of words */
180     size = strlen(string);
181     nsub = 0;
182     inword = FALSE;
183     for (i = 0; i < size; i++) {
184         if (inword == FALSE &&
185            (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
186            inword = TRUE;
187            nsub++;
188         }
189         else if (inword == TRUE &&
190            (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
191            inword = FALSE;
192         }
193     }
194 
195     if ((sa = sarrayCreate(nsub)) == NULL)
196         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
197     sarraySplitString(sa, string, separators);
198 
199     return sa;
200 }
201 
202 
203 /*!
204  *  sarrayCreateLinesFromString()
205  *
206  *      Input:  string
207  *              blankflag  (0 to exclude blank lines; 1 to include)
208  *      Return: sarray, or null on error
209  *
210  *  Notes:
211  *      (1) This finds the number of line substrings, creates an sarray of
212  *          this size, and puts copies of each substring into the sarray.
213  */
214 SARRAY *
sarrayCreateLinesFromString(char * string,l_int32 blankflag)215 sarrayCreateLinesFromString(char    *string,
216                             l_int32  blankflag)
217 {
218 l_int32  i, nsub, size, startptr;
219 char    *cstring, *substring;
220 SARRAY  *sa;
221 
222     PROCNAME("sarrayCreateLinesFromString");
223 
224     if (!string)
225         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
226 
227         /* find the number of lines */
228     size = strlen(string);
229     nsub = 0;
230     for (i = 0; i < size; i++) {
231         if (string[i] == '\n')
232             nsub++;
233     }
234 
235     if ((sa = sarrayCreate(nsub)) == NULL)
236         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
237 
238     if (blankflag) {  /* keep blank lines as null strings */
239             /* Make a copy for munging */
240         if ((cstring = stringNew(string)) == NULL)
241             return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
242             /* We'll insert nulls like strtok */
243         startptr = 0;
244         for (i = 0; i < size; i++) {
245             if (cstring[i] == '\n') {
246                 cstring[i] = '\0';
247                 if ((substring = stringNew(cstring + startptr)) == NULL)
248                     return (SARRAY *)ERROR_PTR("substring not made",
249                                                 procName, NULL);
250                 sarrayAddString(sa, substring, L_INSERT);
251 /*                fprintf(stderr, "substring = %s\n", substring); */
252                 startptr = i + 1;
253             }
254         }
255         if (startptr < size) {  /* no newline at end of last line */
256             if ((substring = stringNew(cstring + startptr)) == NULL)
257                 return (SARRAY *)ERROR_PTR("substring not made",
258                                             procName, NULL);
259             sarrayAddString(sa, substring, L_INSERT);
260 /*            fprintf(stderr, "substring = %s\n", substring); */
261         }
262         FREE(cstring);
263     }
264     else {  /* remove blank lines; use strtok */
265         sarraySplitString(sa, string, "\n");
266     }
267 
268     return sa;
269 }
270 
271 
272 /*!
273  *  sarrayDestroy()
274  *
275  *      Input:  &sarray <to be nulled>
276  *      Return: void
277  *
278  *  Notes:
279  *      (1) Decrements the ref count and, if 0, destroys the sarray.
280  *      (2) Always nulls the input ptr.
281  */
282 void
sarrayDestroy(SARRAY ** psa)283 sarrayDestroy(SARRAY  **psa)
284 {
285 l_int32  i;
286 SARRAY  *sa;
287 
288     PROCNAME("sarrayDestroy");
289 
290     if (psa == NULL) {
291         L_WARNING("ptr address is NULL!", procName);
292         return;
293     }
294     if ((sa = *psa) == NULL)
295         return;
296 
297     sarrayChangeRefcount(sa, -1);
298     if (sarrayGetRefcount(sa) <= 0) {
299         if (sa->array) {
300             for (i = 0; i < sa->n; i++)
301                 FREE(sa->array[i]);
302             FREE(sa->array);
303         }
304         FREE(sa);
305     }
306 
307     *psa = NULL;
308     return;
309 }
310 
311 
312 /*!
313  *  sarrayCopy()
314  *
315  *      Input:  sarray
316  *      Return: copy of sarray, or null on error
317  */
318 SARRAY *
sarrayCopy(SARRAY * sa)319 sarrayCopy(SARRAY  *sa)
320 {
321 l_int32  i;
322 SARRAY  *csa;
323 
324     PROCNAME("sarrayCopy");
325 
326     if (!sa)
327         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
328 
329     if ((csa = sarrayCreate(sa->nalloc)) == NULL)
330         return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
331 
332     for (i = 0; i < sa->n; i++)
333         sarrayAddString(csa, sa->array[i], L_COPY);
334 
335     return csa;
336 }
337 
338 
339 /*!
340  *  sarrayClone()
341  *
342  *      Input:  sarray
343  *      Return: ptr to same sarray, or null on error
344  */
345 SARRAY *
sarrayClone(SARRAY * sa)346 sarrayClone(SARRAY  *sa)
347 {
348     PROCNAME("sarrayClone");
349 
350     if (!sa)
351         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
352     sarrayChangeRefcount(sa, 1);
353     return sa;
354 }
355 
356 
357 /*!
358  *  sarrayAddString()
359  *
360  *      Input:  sarray
361  *              string  (string to be added)
362  *              copyflag (L_INSERT, L_COPY)
363  *      Return: 0 if OK, 1 on error
364  *
365  *  Notes:
366  *      (1) Legacy usage decrees that we always use 0 to insert a string
367  *          directly and 1 to insert a copy of the string.  The
368  *          enums for L_INSERT and L_COPY agree with this convention,
369  *          and will not change in the future.
370  *      (2) See usage comments at the top of this file.
371  */
372 l_int32
sarrayAddString(SARRAY * sa,char * string,l_int32 copyflag)373 sarrayAddString(SARRAY  *sa,
374                 char    *string,
375                 l_int32  copyflag)
376 {
377 l_int32  n;
378 
379     PROCNAME("sarrayAddString");
380 
381     if (!sa)
382         return ERROR_INT("sa not defined", procName, 1);
383     if (!string)
384         return ERROR_INT("string not defined", procName, 1);
385     if (copyflag != L_INSERT && copyflag != L_COPY)
386         return ERROR_INT("invalid copyflag", procName, 1);
387 
388     n = sarrayGetCount(sa);
389     if (n >= sa->nalloc)
390         sarrayExtendArray(sa);
391 
392     if (copyflag == L_INSERT)
393         sa->array[n] = string;
394     else  /* L_COPY */
395         sa->array[n] = stringNew(string);
396     sa->n++;
397 
398     return 0;
399 }
400 
401 
402 /*!
403  *  sarrayExtendArray()
404  *
405  *      Input:  sarray
406  *      Return: 0 if OK, 1 on error
407  */
408 l_int32
sarrayExtendArray(SARRAY * sa)409 sarrayExtendArray(SARRAY  *sa)
410 {
411     PROCNAME("sarrayExtendArray");
412 
413     if (!sa)
414         return ERROR_INT("sa not defined", procName, 1);
415 
416     if ((sa->array = (char **)reallocNew((void **)&sa->array,
417                               sizeof(char *) * sa->nalloc,
418                               2 * sizeof(char *) * sa->nalloc)) == NULL)
419             return ERROR_INT("new ptr array not returned", procName, 1);
420 
421     sa->nalloc *= 2;
422     return 0;
423 }
424 
425 
426 /*!
427  *  sarrayRemoveString()
428  *
429  *      Input:  sarray
430  *              index (of string within sarray)
431  *      Return: removed string, or null on error
432  */
433 char *
sarrayRemoveString(SARRAY * sa,l_int32 index)434 sarrayRemoveString(SARRAY  *sa,
435                    l_int32  index)
436 {
437 char    *string;
438 char   **array;
439 l_int32  i, n, nalloc;
440 
441     PROCNAME("sarrayRemoveString");
442 
443     if (!sa)
444         return (char *)ERROR_PTR("sa not defined", procName, NULL);
445 
446     if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
447         return (char *)ERROR_PTR("array not returned", procName, NULL);
448 
449     if (index < 0 || index >= n)
450         return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
451 
452     string = array[index];
453 
454         /* If removed string is not at end of array, shift
455          * to fill in, maintaining original ordering.
456          * Note: if we didn't care about the order, we could
457          * put the last string array[n - 1] directly into the hole.  */
458     for (i = index; i < n - 1; i++)
459         array[i] = array[i + 1];
460 
461     sa->n--;
462     return string;
463 }
464 
465 
466 /*!
467  *  sarrayClear()
468  *
469  *      Input:  sarray
470  *      Return: 0 if OK; 1 on error
471  */
472 l_int32
sarrayClear(SARRAY * sa)473 sarrayClear(SARRAY  *sa)
474 {
475 l_int32  i;
476 
477     PROCNAME("sarrayClear");
478 
479     if (!sa)
480         return ERROR_INT("sa not defined", procName, 1);
481     for (i = 0; i < sa->n; i++) {  /* free strings and null ptrs */
482         FREE(sa->array[i]);
483         sa->array[i] = NULL;
484     }
485     sa->n = 0;
486     return 0;
487 }
488 
489 
490 /*----------------------------------------------------------------------*
491  *                               Accessors                              *
492  *----------------------------------------------------------------------*/
493 /*!
494  *  sarrayGetCount()
495  *
496  *      Input:  sarray
497  *      Return: count, or 0 if no strings or on error
498  */
499 l_int32
sarrayGetCount(SARRAY * sa)500 sarrayGetCount(SARRAY  *sa)
501 {
502     PROCNAME("sarrayGetCount");
503 
504     if (!sa)
505         return ERROR_INT("sa not defined", procName, 0);
506     return sa->n;
507 }
508 
509 
510 /*!
511  *  sarrayGetArray()
512  *
513  *      Input:  sarray
514  *              &nalloc  (<optional return> number allocated string ptrs)
515  *              &n  (<optional return> number allocated strings)
516  *      Return: ptr to string array, or null on error
517  *
518  *  Notes:
519  *      (1) Caution: the returned array is not a copy, so caller
520  *          must not destroy it!
521  */
522 char **
sarrayGetArray(SARRAY * sa,l_int32 * pnalloc,l_int32 * pn)523 sarrayGetArray(SARRAY   *sa,
524                l_int32  *pnalloc,
525                l_int32  *pn)
526 {
527 char  **array;
528 
529     PROCNAME("sarrayGetArray");
530 
531     if (!sa)
532         return (char **)ERROR_PTR("sa not defined", procName, NULL);
533 
534     array = sa->array;
535     if (pnalloc) *pnalloc = sa->nalloc;
536     if (pn) *pn = sa->n;
537 
538     return array;
539 }
540 
541 
542 /*!
543  *  sarrayGetString()
544  *
545  *      Input:  sarray
546  *              index   (to the index-th string)
547  *              copyflag  (L_NOCOPY or L_COPY)
548  *      Return: string, or null on error
549  *
550  *  Notes:
551  *      (1) Legacy usage decrees that we always use 0 to get the
552  *          pointer to the string itself, and 1 to get a copy of
553  *          the string.
554  *      (2) See usage comments at the top of this file.
555  *      (3) To get a pointer to the string itself, use for copyflag:
556  *             L_NOCOPY or 0 or FALSE
557  *          To get a copy of the string, use for copyflag:
558  *             L_COPY or 1 or TRUE
559  *          The const values of L_NOCOPY and L_COPY are guaranteed not
560  *          to change.
561  */
562 char *
sarrayGetString(SARRAY * sa,l_int32 index,l_int32 copyflag)563 sarrayGetString(SARRAY  *sa,
564                 l_int32  index,
565                 l_int32  copyflag)
566 {
567     PROCNAME("sarrayGetString");
568 
569     if (!sa)
570         return (char *)ERROR_PTR("sa not defined", procName, NULL);
571     if (index < 0 || index >= sa->n)
572         return (char *)ERROR_PTR("index not valid", procName, NULL);
573     if (copyflag != L_NOCOPY && copyflag != L_COPY)
574         return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
575 
576     if (copyflag == L_NOCOPY)
577         return sa->array[index];
578     else  /* L_COPY */
579         return stringNew(sa->array[index]);
580 }
581 
582 
583 /*!
584  *  sarrayGetRefCount()
585  *
586  *      Input:  sarray
587  *      Return: refcount, or UNDEF on error
588  */
589 l_int32
sarrayGetRefcount(SARRAY * sa)590 sarrayGetRefcount(SARRAY  *sa)
591 {
592     PROCNAME("sarrayGetRefcount");
593 
594     if (!sa)
595         return ERROR_INT("sa not defined", procName, UNDEF);
596     return sa->refcount;
597 }
598 
599 
600 /*!
601  *  sarrayChangeRefCount()
602  *
603  *      Input:  sarray
604  *              delta (change to be applied)
605  *      Return: 0 if OK, 1 on error
606  */
607 l_int32
sarrayChangeRefcount(SARRAY * sa,l_int32 delta)608 sarrayChangeRefcount(SARRAY  *sa,
609 		     l_int32  delta)
610 {
611     PROCNAME("sarrayChangeRefcount");
612 
613     if (!sa)
614         return ERROR_INT("sa not defined", procName, UNDEF);
615     sa->refcount += delta;
616     return 0;
617 }
618 
619 
620 /*----------------------------------------------------------------------*
621  *                      Conversion to string                           *
622  *----------------------------------------------------------------------*/
623 /*!
624  *  sarrayToString()
625  *
626  *      Input:  sarray
627  *              addnlflag (flag: 0 adds nothing to each substring
628  *                               1 adds '\n' to each substring
629  *                               2 adds ' ' to each substring)
630  *      Return: dest string, or null on error
631  *
632  *  Notes:
633  *      (1) Concatenates all the strings in the sarray, preserving
634  *          all white space.
635  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
636  *          each substring.
637  *      (3) This function was NOT implemented as:
638  *            for (i = 0; i < n; i++)
639  *                     strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
640  *          Do you see why?
641  */
642 char *
sarrayToString(SARRAY * sa,l_int32 addnlflag)643 sarrayToString(SARRAY  *sa,
644                l_int32  addnlflag)
645 {
646     PROCNAME("sarrayToString");
647 
648     if (!sa)
649         return (char *)ERROR_PTR("sa not defined", procName, NULL);
650 
651     return sarrayToStringRange(sa, 0, 0, addnlflag);
652 }
653 
654 
655 /*!
656  *  sarrayToStringRange()
657  *
658  *      Input: sarray
659  *             first  (index of first string to use; starts with 0)
660  *             nstrings (number of strings to append into the result; use
661  *                       0 to append to the end of the sarray)
662  *             addnlflag (flag: 0 adds nothing to each substring
663  *                              1 adds '\n' to each substring
664  *                              2 adds ' ' to each substring)
665  *      Return: dest string, or null on error
666  *
667  *  Notes:
668  *      (1) Concatenates the specified strings inthe sarray, preserving
669  *          all white space.
670  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
671  *          each substring.
672  *      (3) If the sarray is empty, this returns a string with just
673  *          the character corresponding to @addnlflag.
674  */
675 char *
sarrayToStringRange(SARRAY * sa,l_int32 first,l_int32 nstrings,l_int32 addnlflag)676 sarrayToStringRange(SARRAY  *sa,
677                     l_int32  first,
678                     l_int32  nstrings,
679                     l_int32  addnlflag)
680 {
681 char    *dest, *src;
682 l_int32  n, i, last, size, index, len;
683 
684     PROCNAME("sarrayToStringRange");
685 
686     if (!sa)
687         return (char *)ERROR_PTR("sa not defined", procName, NULL);
688     if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
689         return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
690 
691     n = sarrayGetCount(sa);
692 
693         /* Empty sa; return char corresponding to addnlflag only */
694     if (n == 0) {
695         if (first == 0) {
696             if (addnlflag == 0)
697                 return stringNew("");
698             if (addnlflag == 1)
699                 return stringNew("\n");
700 	    else  /* addnlflag == 2) */
701                 return stringNew(" ");
702         }
703         else
704             return (char *)ERROR_PTR("first not valid", procName, NULL);
705     }
706 
707     if (first < 0 || first >= n)
708         return (char *)ERROR_PTR("first not valid", procName, NULL);
709     if (nstrings == 0 || (nstrings > n - first))
710         nstrings = n - first;  /* no overflow */
711     last = first + nstrings - 1;
712 
713     size = 0;
714     for (i = first; i <= last; i++)
715         size += strlen(sarrayGetString(sa, i, L_NOCOPY)) + 2;
716 
717     if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL)
718         return (char *)ERROR_PTR("dest not made", procName, NULL);
719 
720     index = 0;
721     for (i = first; i <= last; i++) {
722         src = sa->array[i];
723         len = strlen(src);
724         memcpy(dest + index, src, len);
725         index += len;
726         if (addnlflag == 1) {
727             dest[index] = '\n';
728             index++;
729         }
730         else if (addnlflag == 2) {
731             dest[index] = ' ';
732             index++;
733         }
734     }
735 
736     return dest;
737 }
738 
739 
740 /*----------------------------------------------------------------------*
741  *                      Concatenate 2 sarrays                           *
742  *----------------------------------------------------------------------*/
743 /*!
744  *  sarrayConcatenate()
745  *
746  *      Input:  sa1  (to be added to)
747  *              sa2  (append to sa1)
748  *      Return: 0 if OK, 1 on error
749  *
750  *  Notes:
751  *      (1) Copies of the strings in sarray2 are added to sarray1.
752  */
753 l_int32
sarrayConcatenate(SARRAY * sa1,SARRAY * sa2)754 sarrayConcatenate(SARRAY  *sa1,
755                   SARRAY  *sa2)
756 {
757 char    *str;
758 l_int32  n, i;
759 
760     PROCNAME("sarrayConcatenate");
761 
762     if (!sa1)
763         return ERROR_INT("sa1 not defined", procName, 1);
764     if (!sa2)
765         return ERROR_INT("sa2 not defined", procName, 1);
766 
767     n = sarrayGetCount(sa2);
768     for (i = 0; i < n; i++) {
769         str = sarrayGetString(sa2, i, L_NOCOPY);
770         sarrayAddString(sa1, str, L_COPY);
771     }
772 
773     return 0;
774 }
775 
776 
777 /*!
778  *  sarrayAppendRange()
779  *
780  *      Input:  sa1  (to be added to)
781  *              sa2  (append specified range of strings in sa2 to sa1)
782  *              start (index of first string of sa2 to append)
783  *              end (index of last string of sa2 to append)
784  *      Return: 0 if OK, 1 on error
785  *
786  *  Notes:
787  *      (1) Copies of the strings in sarray2 are added to sarray1.
788  *      (2) The [start ... end] range is truncated if necessary.
789  */
790 l_int32
sarrayAppendRange(SARRAY * sa1,SARRAY * sa2,l_int32 start,l_int32 end)791 sarrayAppendRange(SARRAY  *sa1,
792                   SARRAY  *sa2,
793 		  l_int32  start,
794 		  l_int32  end)
795 {
796 char    *str;
797 l_int32  n, i;
798 
799     PROCNAME("sarrayAppendRange");
800 
801     if (!sa1)
802         return ERROR_INT("sa1 not defined", procName, 1);
803     if (!sa2)
804         return ERROR_INT("sa2 not defined", procName, 1);
805     if (start < 0)
806         start = 0;
807     n = sarrayGetCount(sa2);
808     if (end >= n)
809         end = n - 1;
810     if (start > end)
811         return ERROR_INT("start > end", procName, 1);
812 
813     for (i = start; i <= end; i++) {
814         str = sarrayGetString(sa2, i, L_NOCOPY);
815         sarrayAddString(sa1, str, L_COPY);
816     }
817 
818     return 0;
819 }
820 
821 
822 /*----------------------------------------------------------------------*
823  *                   Convert word sarray to line sarray                 *
824  *----------------------------------------------------------------------*/
825 /*!
826  *  sarrayConvertWordsToLines()
827  *
828  *      Input:  sa  (sa of individual words)
829  *              linesize  (max num of chars in each line)
830  *      Return: saout (sa of formatted lines), or null on error
831  *
832  *  This is useful for re-typesetting text to a specific maximum
833  *  line length.  The individual words in the input sarray
834  *  are concatenated into textlines.  An input word string of zero
835  *  length is taken to be a paragraph separator.  Each time
836  *  such a string is found, the current line is ended and
837  *  a new line is also produced that contains just the
838  *  string of zero length ("").  When the output sarray
839  *  of lines is eventually converted to a string with newlines
840  *  (typically) appended to each line string, the empty
841  *  strings are just converted to newlines, producing the visible
842  *  paragraph separation.
843  *
844  *  What happens when a word is larger than linesize?
845  *  We write it out as a single line anyway!  Words preceding
846  *  or following this long word are placed on lines preceding
847  *  or following the line with the long word.  Why this choice?
848  *  Long "words" found in text documents are typically URLs, and
849  *  it's often desirable not to put newlines in the middle of a URL.
850  *  The text display program (e.g., text editor) will typically
851  *  wrap the long "word" to fit in the window.
852  */
853 SARRAY *
sarrayConvertWordsToLines(SARRAY * sa,l_int32 linesize)854 sarrayConvertWordsToLines(SARRAY  *sa,
855                           l_int32  linesize)
856 {
857 char    *wd, *strl;
858 char     emptystring[] = "";
859 l_int32  n, i, len, totlen;
860 SARRAY  *sal, *saout;
861 
862     PROCNAME("sarrayConvertWordsToLines");
863 
864     if (!sa)
865         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
866 
867     if ((saout = sarrayCreate(0)) == NULL)
868         return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL);
869 
870     n = sarrayGetCount(sa);
871     totlen = 0;
872     sal = NULL;
873     for (i = 0; i < n; i++) {
874         if (!sal) {
875             if ((sal = sarrayCreate(0)) == NULL)
876                 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
877         }
878         wd = sarrayGetString(sa, i, L_NOCOPY);
879         len = strlen(wd);
880         if (len == 0) {  /* end of paragraph: end line & insert blank line */
881             if (totlen > 0) {
882                 strl = sarrayToString(sal, 2);
883                 sarrayAddString(saout, strl, L_INSERT);
884             }
885             sarrayAddString(saout, emptystring, L_COPY);
886             sarrayDestroy(&sal);
887             totlen = 0;
888         }
889         else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
890             sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
891         }
892         else if (totlen + len + 1 > linesize) {  /* end line & start new one */
893             strl = sarrayToString(sal, 2);
894             sarrayAddString(saout, strl, L_INSERT);
895             sarrayDestroy(&sal);
896             if ((sal = sarrayCreate(0)) == NULL)
897                 return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
898             sarrayAddString(sal, wd, L_COPY);
899             totlen = len + 1;
900         }
901         else {   /* add to current line */
902             sarrayAddString(sal, wd, L_COPY);
903             totlen += len + 1;
904         }
905     }
906     if (totlen > 0) {   /* didn't end with blank line; output last line */
907         strl = sarrayToString(sal, 2);
908         sarrayAddString(saout, strl, L_INSERT);
909         sarrayDestroy(&sal);
910     }
911 
912     return saout;
913 
914 }
915 
916 
917 /*----------------------------------------------------------------------*
918  *                    Split string on separator list                    *
919  *----------------------------------------------------------------------*/
920 /*
921  *  sarraySplitString()
922  *
923  *      Input:  sa (to append to; typically empty initially)
924  *              str (string to split; not changed)
925  *              separators (characters that split input string)
926  *      Return: 0 if OK, 1 on error.
927  *
928  *  Notes:
929  *      (1) This uses strtokSafe().  See the notes there in utils.c.
930  */
931 l_int32
sarraySplitString(SARRAY * sa,const char * str,const char * separators)932 sarraySplitString(SARRAY      *sa,
933                   const char  *str,
934                   const char  *separators)
935 {
936 char  *cstr, *substr, *saveptr;
937 
938     PROCNAME("sarraySplitString");
939 
940     if (!sa)
941         return ERROR_INT("sa not defined", procName, 1);
942     if (!str)
943         return ERROR_INT("str not defined", procName, 1);
944     if (!separators)
945         return ERROR_INT("separators not defined", procName, 1);
946 
947     cstr = stringNew(str);  /* preserves const-ness of input str */
948     substr = strtokSafe(cstr, separators, &saveptr);
949     if (substr)
950         sarrayAddString(sa, substr, L_INSERT);
951     while ((substr = strtokSafe(NULL, separators, &saveptr)))
952         sarrayAddString(sa, substr, L_INSERT);
953     FREE(cstr);
954 
955     return 0;
956 }
957 
958 
959 /*----------------------------------------------------------------------*
960  *                              Filter sarray                           *
961  *----------------------------------------------------------------------*/
962 /*!
963  *  sarraySelectBySubstring()
964  *
965  *      Input:  sain (input sarray)
966  *              substr (<optional> substring for matching; can be NULL)
967  *      Return: saout (output sarray, filtered with substring) or null on error
968  *
969  *  Notes:
970  *      (1) This selects all strings in sain that have substr as a substring.
971  *          Note that we can't use strncmp() because we're looking for
972  *          a match to the substring anywhere within each filename.
973  *      (2) If substr == NULL, returns a copy of the sarray.
974  */
975 SARRAY *
sarraySelectBySubstring(SARRAY * sain,const char * substr)976 sarraySelectBySubstring(SARRAY      *sain,
977                         const char  *substr)
978 {
979 char    *str;
980 l_int32  n, i, offset, found;
981 SARRAY  *saout;
982 
983     PROCNAME("sarraySelectBySubstring");
984 
985     if (!sain)
986         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
987 
988     n = sarrayGetCount(sain);
989     if (!substr || n == 0)
990         return sarrayCopy(sain);
991 
992     saout = sarrayCreate(n);
993     for (i = 0; i < n; i++) {
994         str = sarrayGetString(sain, i, L_NOCOPY);
995         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
996                           strlen(substr), &offset, &found);
997         if (found)
998             sarrayAddString(saout, str, L_COPY);
999     }
1000 
1001     return saout;
1002 }
1003 
1004 
1005 /*!
1006  *  sarrayParseRange()
1007  *
1008  *      Input:  sa (input sarray)
1009  *              start (index to start range search)
1010  *             &actualstart (<return> index of actual start; may be > 'start')
1011  *             &end (<return> index of end)
1012  *             &newstart (<return> index of start of next range)
1013  *              substr (substring for matching at beginning of string)
1014  *              loc (byte offset within the string for the pattern; use
1015  *                   -1 if the location does not matter);
1016  *      Return: 0 if valid range found; 1 otherwise
1017  *
1018  *  Notes:
1019  *      (1) This finds the range of the next set of strings in SA,
1020  *          beginning the search at 'start', that does NOT have
1021  *          the substring 'substr' either at the indicated location
1022  *          in the string or anywhere in the string.  The input
1023  *          variable 'loc' is the specified offset within the string;
1024  *          use -1 to indicate 'anywhere in the string'.
1025  *      (2) Always check the return value to verify that a valid range
1026  *          was found.
1027  *      (3) If a valid range is not found, the values of actstart,
1028  *          end and newstart are all set to the size of sa.
1029  *      (4) If this is the last valid range, newstart returns the value n.
1030  *          In use, this should be tested before calling the function.
1031  *      (5) Usage example.  To find all the valid ranges in a file
1032  *          where the invalid lines begin with two dashes, copy each
1033  *          line in the file to a string in an sarray, and do:
1034  *             start = 0;
1035  *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
1036  *                    "--", 0))
1037  *                 fprintf(stderr, "start = %d, end = %d\n", actstart, end);
1038  */
1039 l_int32
sarrayParseRange(SARRAY * sa,l_int32 start,l_int32 * pactualstart,l_int32 * pend,l_int32 * pnewstart,const char * substr,l_int32 loc)1040 sarrayParseRange(SARRAY      *sa,
1041                  l_int32      start,
1042                  l_int32     *pactualstart,
1043                  l_int32     *pend,
1044                  l_int32     *pnewstart,
1045                  const char  *substr,
1046 		 l_int32      loc)
1047 {
1048 char    *str;
1049 l_int32  n, i, offset, found;
1050 
1051     PROCNAME("sarrayParseRange");
1052 
1053     if (!sa)
1054         return ERROR_INT("sa not defined", procName, 1);
1055     if (!pactualstart || !pend || !pnewstart)
1056         return ERROR_INT("not all range addresses defined", procName, 1);
1057     n = sarrayGetCount(sa);
1058     *pactualstart = *pend = *pnewstart = n;
1059     if (!substr)
1060         return ERROR_INT("substr not defined", procName, 1);
1061 
1062         /* Look for the first string without the marker */
1063     if (start < 0 || start >= n)
1064         return 1;
1065     for (i = start; i < n; i++) {
1066         str = sarrayGetString(sa, i, L_NOCOPY);
1067         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1068                           strlen(substr), &offset, &found);
1069 	if (loc < 0) {
1070             if (!found) break;
1071 	} else {
1072             if (!found || offset != loc) break;
1073 	}
1074     }
1075     start = i;
1076     if (i == n)  /* couldn't get started */
1077         return 1;
1078 
1079         /* Look for the last string without the marker */
1080     *pactualstart = start;
1081     for (i = start + 1; i < n; i++) {
1082         str = sarrayGetString(sa, i, L_NOCOPY);
1083         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1084                           strlen(substr), &offset, &found);
1085 	if (loc < 0) {
1086             if (found) break;
1087 	} else {
1088             if (found && offset == loc) break;
1089 	}
1090     }
1091     *pend = i - 1;
1092     start = i;
1093     if (i == n)  /* no further range */
1094         return 0;
1095 
1096         /* Look for the first string after *pend without the marker.
1097          * This will start the next run of strings, if it exists. */
1098     for (i = start; i < n; i++) {
1099         str = sarrayGetString(sa, i, L_NOCOPY);
1100         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1101                           strlen(substr), &offset, &found);
1102 	if (loc < 0) {
1103             if (!found) break;
1104 	} else {
1105             if (!found || offset != loc) break;
1106 	}
1107     }
1108     if (i < n)
1109         *pnewstart = i;
1110 
1111     return 0;
1112 }
1113 
1114 
1115 /*----------------------------------------------------------------------*
1116  *                                   Sort                               *
1117  *----------------------------------------------------------------------*/
1118 /*!
1119  *  sarraySort()
1120  *
1121  *      Input:  saout (output sarray; can be NULL or equal to sain)
1122  *              sain (input sarray)
1123  *              sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
1124  *      Return: saout (output sarray, sorted by ascii value), or null on error
1125  *
1126  *  Notes:
1127  *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.
1128  *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
1129  *          Slow but simple O(n logn) sort.
1130  */
1131 SARRAY *
sarraySort(SARRAY * saout,SARRAY * sain,l_int32 sortorder)1132 sarraySort(SARRAY  *saout,
1133            SARRAY  *sain,
1134            l_int32  sortorder)
1135 {
1136 char   **array;
1137 char    *tmp;
1138 l_int32  n, i, j, gap;
1139 
1140     PROCNAME("sarraySort");
1141 
1142     if (!sain)
1143         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
1144 
1145         /* Make saout if necessary; otherwise do in-place */
1146     if (!saout)
1147         saout = sarrayCopy(sain);
1148     else if (sain != saout)
1149         return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
1150     array = saout->array;  /* operate directly on the array */
1151     n = sarrayGetCount(saout);
1152 
1153         /* Shell sort */
1154     for (gap = n/2; gap > 0; gap = gap / 2) {
1155         for (i = gap; i < n; i++) {
1156             for (j = i - gap; j >= 0; j -= gap) {
1157                 if ((sortorder == L_SORT_INCREASING &&
1158                      stringCompareLexical(array[j], array[j + gap])) ||
1159                     (sortorder == L_SORT_DECREASING &&
1160                      stringCompareLexical(array[j + gap], array[j])))
1161                 {
1162                     tmp = array[j];
1163                     array[j] = array[j + gap];
1164                     array[j + gap] = tmp;
1165                 }
1166             }
1167         }
1168     }
1169 
1170     return saout;
1171 }
1172 
1173 
1174 /*!
1175  *  stringCompareLexical()
1176  *
1177  *      Input:  str1
1178  *              str2
1179  *      Return: 1 if str1 > str2 (lexically); 0 otherwise
1180  *
1181  *  Notes:
1182  *      (1) If the lexical values are identical, return a 0, to
1183  *          indicate that no swapping is required to sort the strings.
1184  */
1185 l_int32
stringCompareLexical(const char * str1,const char * str2)1186 stringCompareLexical(const char *str1,
1187                      const char *str2)
1188 {
1189 l_int32  i, len1, len2, len;
1190 
1191     PROCNAME("sarrayCompareLexical");
1192 
1193     if (!str1)
1194         return ERROR_INT("str1 not defined", procName, 1);
1195     if (!str2)
1196         return ERROR_INT("str2 not defined", procName, 1);
1197 
1198     len1 = strlen(str1);
1199     len2 = strlen(str2);
1200     len = L_MIN(len1, len2);
1201 
1202     for (i = 0; i < len; i++) {
1203         if (str1[i] == str2[i])
1204             continue;
1205         if (str1[i] > str2[i])
1206             return 1;
1207         else
1208             return 0;
1209     }
1210 
1211     if (len1 > len2)
1212         return 1;
1213     else
1214         return 0;
1215 }
1216 
1217 
1218 /*----------------------------------------------------------------------*
1219  *                           Serialize for I/O                          *
1220  *----------------------------------------------------------------------*/
1221 /*!
1222  *  sarrayRead()
1223  *
1224  *      Input:  filename
1225  *      Return: sarray, or null on error
1226  */
1227 SARRAY *
sarrayRead(const char * filename)1228 sarrayRead(const char  *filename)
1229 {
1230 FILE    *fp;
1231 SARRAY  *sa;
1232 
1233     PROCNAME("sarrayRead");
1234 
1235     if (!filename)
1236         return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
1237 
1238     if ((fp = fopenReadStream(filename)) == NULL)
1239         return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
1240 
1241     if ((sa = sarrayReadStream(fp)) == NULL) {
1242         fclose(fp);
1243         return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
1244     }
1245 
1246     fclose(fp);
1247     return sa;
1248 }
1249 
1250 
1251 /*!
1252  *  sarrayReadStream()
1253  *
1254  *      Input:  stream
1255  *      Return: sarray, or null on error
1256  *
1257  *  Notes:
1258  *      (1) We store the size of each string along with the string.
1259  *      (2) This allows a string to have embedded newlines.  By reading
1260  *          the entire string, as determined by its size, we are
1261  *          not affected by any number of embedded newlines.
1262  */
1263 SARRAY *
sarrayReadStream(FILE * fp)1264 sarrayReadStream(FILE  *fp)
1265 {
1266 char    *stringbuf;
1267 l_int32  i, n, size, index, bufsize, ret, version;
1268 SARRAY  *sa;
1269 
1270     PROCNAME("sarrayReadStream");
1271 
1272     if (!fp)
1273         return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
1274 
1275     ret = fscanf(fp, "\nSarray Version %d\n", &version);
1276     if (ret != 1)
1277         return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
1278     if (version != SARRAY_VERSION_NUMBER)
1279         return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
1280     fscanf(fp, "Number of strings = %d\n", &n);
1281 
1282     if ((sa = sarrayCreate(n)) == NULL)
1283         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1284     bufsize = L_BUF_SIZE + 1;
1285     if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
1286         return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
1287 
1288     for (i = 0; i < n; i++) {
1289 	    /* Get the size of the stored string */
1290         fscanf(fp, "%d[%d]:", &index, &size);
1291 	    /* Expand the string buffer if necessary */
1292 	if (size > bufsize - 5) {
1293             FREE(stringbuf);
1294 	    bufsize = (l_int32)(1.5 * size);
1295             stringbuf = (char *)CALLOC(bufsize, sizeof(char));
1296 	}
1297 	    /* Read the stored string, plus leading spaces and trailing \n */
1298 	fread(stringbuf, 1, size + 3, fp);
1299 	    /* Remove the \n that was added by sarrayWriteStream() */
1300 	stringbuf[size + 2] = '\0';
1301 	    /* Copy it in, skipping the 2 leading spaces */
1302         sarrayAddString(sa, stringbuf + 2, L_COPY);
1303     }
1304     fscanf(fp, "\n");
1305 
1306     FREE(stringbuf);
1307     return sa;
1308 }
1309 
1310 
1311 /*!
1312  *  sarrayWrite()
1313  *
1314  *      Input:  filename
1315  *              sarray
1316  *      Return: 0 if OK; 1 on error
1317  */
1318 l_int32
sarrayWrite(const char * filename,SARRAY * sa)1319 sarrayWrite(const char  *filename,
1320             SARRAY      *sa)
1321 {
1322 FILE  *fp;
1323 
1324     PROCNAME("sarrayWrite");
1325 
1326     if (!filename)
1327         return ERROR_INT("filename not defined", procName, 1);
1328     if (!sa)
1329         return ERROR_INT("sa not defined", procName, 1);
1330 
1331     if ((fp = fopen(filename, "w")) == NULL)
1332         return ERROR_INT("stream not opened", procName, 1);
1333 
1334     if (sarrayWriteStream(fp, sa))
1335         return ERROR_INT("sa not written to stream", procName, 1);
1336 
1337     fclose(fp);
1338     return 0;
1339 }
1340 
1341 
1342 /*!
1343  *  sarrayWriteStream()
1344  *
1345  *      Input:  stream
1346  *              sarray
1347  *      Returns 0 if OK; 1 on error
1348  *
1349  *  Notes:
1350  *      (1) This appends a '\n' to each string, which is stripped
1351  *          off by sarrayReadStream().
1352  */
1353 l_int32
sarrayWriteStream(FILE * fp,SARRAY * sa)1354 sarrayWriteStream(FILE    *fp,
1355                   SARRAY  *sa)
1356 {
1357 l_int32  i, n, len;
1358 
1359     PROCNAME("sarrayWriteStream");
1360 
1361     if (!fp)
1362         return ERROR_INT("stream not defined", procName, 1);
1363     if (!sa)
1364         return ERROR_INT("sa not defined", procName, 1);
1365 
1366     n = sarrayGetCount(sa);
1367     fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
1368     fprintf(fp, "Number of strings = %d\n", n);
1369     for (i = 0; i < n; i++) {
1370         len = strlen(sa->array[i]);
1371         fprintf(fp, "  %d[%d]:  %s\n", i, len, sa->array[i]);
1372     }
1373     fprintf(fp, "\n");
1374 
1375     return 0;
1376 }
1377 
1378 
1379 /*!
1380  *  sarrayAppend()
1381  *
1382  *      Input:  filename
1383  *              sarray
1384  *      Return: 0 if OK; 1 on error
1385  */
1386 l_int32
sarrayAppend(const char * filename,SARRAY * sa)1387 sarrayAppend(const char  *filename,
1388              SARRAY      *sa)
1389 {
1390 FILE  *fp;
1391 
1392     PROCNAME("sarrayAppend");
1393 
1394     if (!filename)
1395         return ERROR_INT("filename not defined", procName, 1);
1396     if (!sa)
1397         return ERROR_INT("sa not defined", procName, 1);
1398 
1399     if ((fp = fopen(filename, "a")) == NULL)
1400         return ERROR_INT("stream not opened", procName, 1);
1401 
1402     if (sarrayWriteStream(fp, sa))
1403         return ERROR_INT("sa not appended to stream", procName, 1);
1404 
1405     fclose(fp);
1406     return 0;
1407 }
1408 
1409 
1410 /*---------------------------------------------------------------------*
1411  *                           Directory filenames                       *
1412  *---------------------------------------------------------------------*/
1413 /*!
1414  *  getSortedPathnamesInDirectory()
1415  *
1416  *      Input:  directory name
1417  *              substr (<optional> substring filter on filenames; can be NULL)
1418  *              firstpage (0-based)
1419  *              npages (use 0 for all to the end)
1420  *      Return: sarray of sorted pathnames, or NULL on error
1421  *
1422  *  Notes:
1423  *      (1) If 'substr' is not NULL, only filenames that contain
1424  *          the substring can be returned.  If 'substr' is NULL,
1425  *          none of the filenames are filtered out.
1426  *      (2) The files in the directory, after optional filtering by
1427  *          the substring, are lexically sorted in increasing order.
1428  *          The full pathnames are returned for the requested sequence.
1429  *          If no files are found after filtering, returns an empty sarray.
1430  */
1431 SARRAY *
getSortedPathnamesInDirectory(const char * dirname,const char * substr,l_int32 firstpage,l_int32 npages)1432 getSortedPathnamesInDirectory(const char  *dirname,
1433                               const char  *substr,
1434                               l_int32      firstpage,
1435                               l_int32      npages)
1436 {
1437 char    *fname, *fullname;
1438 l_int32  i, nfiles, lastpage;
1439 SARRAY  *sa, *safiles, *saout;
1440 
1441     PROCNAME("getSortedPathnamesInDirectory");
1442 
1443     if (!dirname)
1444         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1445 
1446     if ((sa = getFilenamesInDirectory(dirname)) == NULL)
1447         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1448     safiles = sarraySelectBySubstring(sa, substr);
1449     sarrayDestroy(&sa);
1450     nfiles = sarrayGetCount(safiles);
1451     if (nfiles == 0) {
1452         L_WARNING("no files found", procName);
1453         return safiles;
1454     }
1455 
1456     sarraySort(safiles, safiles, L_SORT_INCREASING);
1457 
1458     firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1);
1459     if (npages == 0)
1460         npages = nfiles - firstpage;
1461     lastpage = L_MIN(firstpage + npages - 1, nfiles - 1);
1462 
1463     saout = sarrayCreate(lastpage - firstpage + 1);
1464     for (i = firstpage; i <= lastpage; i++) {
1465         fname = sarrayGetString(safiles, i, L_NOCOPY);
1466         fullname = genPathname(dirname, fname);
1467         sarrayAddString(saout, fullname, L_INSERT);
1468     }
1469 
1470     sarrayDestroy(&safiles);
1471     return saout;
1472 }
1473 
1474 
1475 /*!
1476  *  getFilenamesInDirectory()
1477  *
1478  *      Input:  directory name
1479  *      Return: sarray of file names, or NULL on error
1480  *
1481  *  Notes:
1482  *      (1) The versions compiled under unix and cygwin use the POSIX C
1483  *          library commands for handling directories.  For windows,
1484  *          there is a separate implementation.
1485  *      (2) It returns an array of filename tails; i.e., only the part of
1486  *          the path after the last slash.
1487  *      (3) Use of the d_type field of dirent is not portable:
1488  *          "According to POSIX, the dirent structure contains a field
1489  *          char d_name[] of unspecified size, with at most NAME_MAX
1490  *          characters preceding the terminating null character.  Use
1491  *          of other fields will harm the portability of your programs."
1492  *      (4) As a consequence of (3), we note several things:
1493  *           - MINGW doesn't have a d_type member.
1494  *           - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
1495  *             for d_type from all files.
1496  *          On these systems, this function will return directories
1497  *          (except for '.' and '..', which are eliminated using
1498  *          the d_name field).
1499  */
1500 #ifndef COMPILER_MSVC
1501 
1502 SARRAY *
getFilenamesInDirectory(const char * dirname)1503 getFilenamesInDirectory(const char  *dirname)
1504 {
1505 char           *name;
1506 l_int32         len;
1507 SARRAY         *safiles;
1508 DIR            *pdir;
1509 struct dirent  *pdirentry;
1510 
1511     PROCNAME("getFilenamesInDirectory");
1512 
1513     if (!dirname)
1514         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1515 
1516     if ((safiles = sarrayCreate(0)) == NULL)
1517         return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
1518     if ((pdir = opendir(dirname)) == NULL)
1519         return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
1520     while ((pdirentry = readdir(pdir)))  {
1521 
1522         /* It's nice to ignore directories.  For this it is necessary to
1523          * define _BSD_SOURCE in the CC command, because the DT_DIR
1524          * flag is non-standard.  */
1525 #if !defined(__MINGW32__) && !defined(_CYGWIN_ENVIRON) && !defined(__SOLARIS__)
1526         if (pdirentry->d_type == DT_DIR)
1527             continue;
1528 #endif
1529 
1530             /* Filter out "." and ".." if they're passed through */
1531         name = pdirentry->d_name;
1532         len = strlen(name);
1533         if (len == 1 && name[len - 1] == '.') continue;
1534         if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue;
1535         sarrayAddString(safiles, name, L_COPY);
1536     }
1537     closedir(pdir);
1538 
1539     return safiles;
1540 }
1541 
1542 #else  /* COMPILER_MSVC */
1543 
1544     /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
1545 #include <windows.h>
1546 #include <tchar.h>
1547 SARRAY *
getFilenamesInDirectory(const char * dirname)1548 getFilenamesInDirectory(const char  *dirname)
1549 {
1550 SARRAY           *safiles;
1551 WIN32_FIND_DATAA  ffd;
1552 size_t            length_of_path;
1553 CHAR              szDir[MAX_PATH];  /* MAX_PATH is defined in stdlib.h */
1554 HANDLE            hFind = INVALID_HANDLE_VALUE;
1555 
1556     PROCNAME("getFilenamesInDirectory");
1557 
1558     if (!dirname)
1559         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1560 
1561     length_of_path = strlen(dirname);
1562     if (length_of_path > (MAX_PATH - 2))
1563         return (SARRAY *)ERROR_PTR("dirname is to long", procName, NULL);
1564 
1565     strncpy(szDir, dirname, MAX_PATH);
1566     szDir[MAX_PATH - 1] = '\0';
1567     strncat(szDir, TEXT("\\*"), MAX_PATH - strlen(szDir));
1568 
1569     if ((safiles = sarrayCreate(0)) == NULL)
1570         return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
1571     hFind = FindFirstFileA(szDir, &ffd);
1572     if (INVALID_HANDLE_VALUE == hFind) {
1573         sarrayDestroy(&safiles);
1574         return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
1575     }
1576 
1577     while (FindNextFileA(hFind, &ffd) != 0) {
1578         if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
1579             continue;
1580         sarrayAddString(safiles, ffd.cFileName, L_COPY);
1581     }
1582 
1583     FindClose(hFind);
1584     return safiles;
1585 }
1586 
1587 #endif  /* COMPILER_MSVC */
1588 
1589