• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2  * licenses follows.
3  */
4 
5 /* LibHnj - a library for high quality hyphenation and justification
6  * Copyright (C) 1998 Raph Levien,
7  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9  *           (C) 2006, 2007, 2008 László Németh (nemeth at OOo)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Library General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Library General Public License for more details.
20  *
21  * You should have received a copy of the GNU Library General Public
22  * License along with this library; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA  02111-1307  USA.
25  */
26 
27 /*
28  * The contents of this file are subject to the Mozilla Public License
29  * Version 1.0 (the "MPL"); you may not use this file except in
30  * compliance with the MPL.  You may obtain a copy of the MPL at
31  * http://www.mozilla.org/MPL/
32  *
33  * Software distributed under the MPL is distributed on an "AS IS" basis,
34  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35  * for the specific language governing rights and limitations under the
36  * MPL.
37  *
38  */
39 #include <fcntl.h>
40 #include <sys/mman.h>
41 #include <sys/stat.h>
42 #include <stdlib.h> /* for NULL, malloc */
43 #include <stdio.h>  /* for fprintf */
44 #include <string.h> /* for strdup */
45 #include <unistd.h> /* for close */
46 
47 #define noVERBOSE
48 
49 #include "hnjalloc.h"
50 #include "hyphen.h"
51 
52 static char *
hnj_strdup(const char * s)53 hnj_strdup (const char *s)
54 {
55     char *new;
56     int l;
57 
58     l = strlen (s);
59     new = hnj_malloc (l + 1);
60     memcpy (new, s, l);
61     new[l] = 0;
62     return new;
63 }
64 
65 /* remove cross-platform text line end characters */
hnj_strchomp(char * s)66 void hnj_strchomp(char * s)
67 {
68     int k = strlen(s);
69     if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
70     if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
71 }
72 
73 /* a little bit of a hash table implementation. This simply maps strings
74    to state numbers */
75 
76 typedef struct _HashTab HashTab;
77 typedef struct _HashEntry HashEntry;
78 
79 /* A cheap, but effective, hack. */
80 #define HASH_SIZE 31627
81 
82 struct _HashTab {
83     HashEntry *entries[HASH_SIZE];
84 };
85 
86 struct _HashEntry {
87     HashEntry *next;
88     char *key;
89     int val;
90 };
91 
92 /* a char* hash function from ASU - adapted from Gtk+ */
93 static unsigned int
hnj_string_hash(const char * s)94 hnj_string_hash (const char *s)
95 {
96     const char *p;
97     unsigned int h=0, g;
98     for(p = s; *p != '\0'; p += 1) {
99         h = ( h << 4 ) + *p;
100         if ( ( g = h & 0xf0000000 ) ) {
101             h = h ^ (g >> 24);
102             h = h ^ g;
103         }
104     }
105     return h /* % M */;
106 }
107 
108 static HashTab *
hnj_hash_new(void)109 hnj_hash_new (void)
110 {
111     HashTab *hashtab;
112     int i;
113 
114     hashtab = hnj_malloc (sizeof(HashTab));
115     for (i = 0; i < HASH_SIZE; i++)
116         hashtab->entries[i] = NULL;
117 
118     return hashtab;
119 }
120 
121 static void
hnj_hash_free(HashTab * hashtab)122 hnj_hash_free (HashTab *hashtab)
123 {
124     int i;
125     HashEntry *e, *next;
126 
127     for (i = 0; i < HASH_SIZE; i++)
128         for (e = hashtab->entries[i]; e; e = next)
129         {
130             next = e->next;
131             hnj_free (e->key);
132             hnj_free (e);
133         }
134 
135     hnj_free (hashtab);
136 }
137 
138 /* assumes that key is not already present! */
139 static void
hnj_hash_insert(HashTab * hashtab,const char * key,int val)140 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
141 {
142     int i;
143     HashEntry *e;
144 
145     i = hnj_string_hash (key) % HASH_SIZE;
146     e = hnj_malloc (sizeof(HashEntry));
147     e->next = hashtab->entries[i];
148     e->key = hnj_strdup (key);
149     e->val = val;
150     hashtab->entries[i] = e;
151 }
152 
153 /* return val if found, otherwise -1 */
154 static int
hnj_hash_lookup(HashTab * hashtab,const char * key)155 hnj_hash_lookup (HashTab *hashtab, const char *key)
156 {
157     int i;
158     HashEntry *e;
159     i = hnj_string_hash (key) % HASH_SIZE;
160     for (e = hashtab->entries[i]; e; e = e->next)
161         if (!strcmp (key, e->key))
162             return e->val;
163     return -1;
164 }
165 
166 /* Get the state number, allocating a new state if necessary. */
167 static int
hnj_get_state(HyphenDict * dict,HashTab * hashtab,const char * string)168 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
169 {
170     int state_num;
171 
172     state_num = hnj_hash_lookup (hashtab, string);
173 
174     if (state_num >= 0)
175         return state_num;
176 
177     hnj_hash_insert (hashtab, string, dict->num_states);
178     /* predicate is true if dict->num_states is a power of two */
179     if (!(dict->num_states & (dict->num_states - 1)))
180     {
181         dict->states = hnj_realloc (dict->states,
182             (dict->num_states << 1) *
183             sizeof(HyphenState));
184     }
185     dict->states[dict->num_states].match = NULL;
186     dict->states[dict->num_states].repl = NULL;
187     dict->states[dict->num_states].fallback_state = -1;
188     dict->states[dict->num_states].num_trans = 0;
189     dict->states[dict->num_states].trans = NULL;
190     return dict->num_states++;
191 }
192 
193 /* add a transition from state1 to state2 through ch - assumes that the
194    transition does not already exist */
195 static void
hnj_add_trans(HyphenDict * dict,int state1,int state2,char ch)196 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
197 {
198     int num_trans;
199 
200     num_trans = dict->states[state1].num_trans;
201     if (num_trans == 0)
202     {
203         dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
204     }
205     else if (!(num_trans & (num_trans - 1)))
206     {
207         dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
208             (num_trans << 1) *
209             sizeof(HyphenTrans));
210     }
211     dict->states[state1].trans[num_trans].ch = ch;
212     dict->states[state1].trans[num_trans].new_state = state2;
213     dict->states[state1].num_trans++;
214 }
215 
216 #ifdef VERBOSE
217 HashTab *global;
218 
219 static char *
get_state_str(int state)220 get_state_str (int state)
221 {
222     int i;
223     HashEntry *e;
224 
225     for (i = 0; i < HASH_SIZE; i++)
226         for (e = global->entries[i]; e; e = e->next)
227             if (e->val == state)
228                 return e->key;
229     return NULL;
230 }
231 #endif
232 
233 // Get a line from the dictionary contents.
234 static char *
get_line(char * s,int size,const char * dict_contents,int dict_length,int * dict_ptr)235 get_line (char *s, int size, const char *dict_contents, int dict_length,
236     int *dict_ptr)
237 {
238     int len = 0;
239     while (len < (size - 1) && *dict_ptr < dict_length) {
240         s[len++] = *(dict_contents + *dict_ptr);
241         (*dict_ptr)++;
242         if (s[len - 1] == '\n')
243             break;
244     }
245     s[len] = '\0';
246     if (len > 0) {
247         return s;
248     } else {
249         return NULL;
250     }
251 }
252 
253 HyphenDict *
hnj_hyphen_load(const char * fn)254 hnj_hyphen_load (const char *fn)
255 {
256     if (fn == NULL)
257         return NULL;
258     const int fd = open(fn, O_RDONLY);
259     if (fd == -1)
260         return NULL;
261     struct stat sb;
262     if (fstat(fd, &sb) == -1)  {  /* To obtain file size */
263         close(fd);
264         return NULL;
265     }
266 
267     const char *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
268     if (addr == MAP_FAILED) {
269         close(fd);
270         return NULL;
271     }
272     HyphenDict *dict = hnj_hyphen_load_from_buffer(addr, sb.st_size);
273     munmap((void *)addr, sb.st_size);
274     close(fd);
275 
276     return dict;
277 }
278 
279 HyphenDict *
hnj_hyphen_load_from_buffer(const char * dict_contents,int dict_length)280 hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length)
281 {
282     HyphenDict *dict[2];
283     HashTab *hashtab;
284     char buf[MAX_CHARS];
285     char word[MAX_CHARS];
286     char pattern[MAX_CHARS];
287     char * repl;
288     signed char replindex;
289     signed char replcut;
290     int state_num = 0, last_state;
291     int i, j, k;
292     char ch;
293     int found;
294     HashEntry *e;
295     int nextlevel = 0;
296 
297     if (dict_contents == NULL)
298         return NULL;
299 
300     int dict_ptr = 0;
301 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
302     for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
303         hashtab = hnj_hash_new ();
304 #ifdef VERBOSE
305         global = hashtab;
306 #endif
307         hnj_hash_insert (hashtab, "", 0);
308         dict[k] = hnj_malloc (sizeof(HyphenDict));
309         dict[k]->num_states = 1;
310         dict[k]->states = hnj_malloc (sizeof(HyphenState));
311         dict[k]->states[0].match = NULL;
312         dict[k]->states[0].repl = NULL;
313         dict[k]->states[0].fallback_state = -1;
314         dict[k]->states[0].num_trans = 0;
315         dict[k]->states[0].trans = NULL;
316         dict[k]->nextlevel = NULL;
317         dict[k]->lhmin = 0;
318         dict[k]->rhmin = 0;
319         dict[k]->clhmin = 0;
320         dict[k]->crhmin = 0;
321 
322         /* read in character set info */
323         if (k == 0) {
324             for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
325             get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents,
326                 dict_length, &dict_ptr);
327             for (i=0;i<MAX_NAME;i++)
328                 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
329                     dict[k]->cset[i] = 0;
330             dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
331         } else {
332             strcpy(dict[k]->cset, dict[0]->cset);
333             dict[k]->utf8 = dict[0]->utf8;
334         }
335 
336         while (get_line(buf, sizeof(buf), dict_contents, dict_length,
337                 &dict_ptr) != NULL)
338         {
339             if (buf[0] != '%')
340             {
341                 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
342                     nextlevel = 1;
343                     break;
344                 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
345                     dict[k]->lhmin = atoi(buf + 13);
346                     continue;
347                 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
348                     dict[k]->rhmin = atoi(buf + 14);
349                     continue;
350                 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
351                     dict[k]->clhmin = atoi(buf + 21);
352                     continue;
353                 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
354                     dict[k]->crhmin = atoi(buf + 22);
355                     continue;
356                 }
357                 j = 0;
358                 pattern[j] = '0';
359                 repl = strchr(buf, '/');
360                 replindex = 0;
361                 replcut = 0;
362                 if (repl) {
363                     char * index = strchr(repl + 1, ',');
364                     *repl = '\0';
365                     if (index) {
366                         char * index2 = strchr(index + 1, ',');
367                         *index = '\0';
368                         if (index2) {
369                             *index2 = '\0';
370                             replindex = (signed char) atoi(index + 1) - 1;
371                             replcut = (signed char) atoi(index2 + 1);
372                         }
373                     } else {
374                         hnj_strchomp(repl + 1);
375                         replindex = 0;
376                         replcut = strlen(buf);
377                     }
378                     repl = hnj_strdup(repl + 1);
379                 }
380                 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
381                 {
382                     if (buf[i] >= '0' && buf[i] <= '9')
383                         pattern[j] = buf[i];
384                     else
385                     {
386                         word[j] = buf[i];
387                         pattern[++j] = '0';
388                     }
389                 }
390                 word[j] = '\0';
391                 pattern[j + 1] = '\0';
392 
393                 i = 0;
394                 if (!repl) {
395                     /* Optimize away leading zeroes */
396                     for (; pattern[i] == '0'; i++);
397                 } else {
398                     if (*word == '.') i++;
399                     /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
400                     if (dict[k]->utf8) {
401                         int pu = -1;        /* unicode character position */
402                         int ps = -1;        /* unicode start position (original replindex) */
403                         int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
404                         for (; pc < (strlen(word) + 1); pc++) {
405                             /* beginning of an UTF-8 character (not '10' start bits) */
406                             if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
407                             if ((ps < 0) && (replindex == pu)) {
408                                 ps = replindex;
409                                 replindex = pc;
410                             }
411                             if ((ps >= 0) && ((pu - ps) == replcut)) {
412                                 replcut = (pc - replindex);
413                                 break;
414                             }
415                         }
416                         if (*word == '.') replindex--;
417                     }
418                 }
419 
420 #ifdef VERBOSE
421                 printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
422 #endif
423                 found = hnj_hash_lookup (hashtab, word);
424                 state_num = hnj_get_state (dict[k], hashtab, word);
425                 dict[k]->states[state_num].match = hnj_strdup (pattern + i);
426                 dict[k]->states[state_num].repl = repl;
427                 dict[k]->states[state_num].replindex = replindex;
428                 if (!replcut) {
429                     dict[k]->states[state_num].replcut = strlen(word);
430                 } else {
431                     dict[k]->states[state_num].replcut = replcut;
432                 }
433 
434                 /* now, put in the prefix transitions */
435                 for (; found < 0 ;j--)
436                 {
437                     last_state = state_num;
438                     ch = word[j - 1];
439                     word[j - 1] = '\0';
440                     found = hnj_hash_lookup (hashtab, word);
441                     state_num = hnj_get_state (dict[k], hashtab, word);
442                     hnj_add_trans (dict[k], state_num, last_state, ch);
443                 }
444             }
445         }
446 
447         /* Could do unioning of matches here (instead of the preprocessor script).
448            If we did, the pseudocode would look something like this:
449 
450            foreach state in the hash table
451            foreach i = [1..length(state) - 1]
452            state to check is substr (state, i)
453            look it up
454            if found, and if there is a match, union the match in.
455 
456            It's also possible to avoid the quadratic blowup by doing the
457            search in order of increasing state string sizes - then you
458            can break the loop after finding the first match.
459 
460            This step should be optional in any case - if there is a
461            preprocessed rule table, it's always faster to use that.
462 
463         */
464 
465         /* put in the fallback states */
466         for (i = 0; i < HASH_SIZE; i++)
467             for (e = hashtab->entries[i]; e; e = e->next)
468             {
469                 if (*(e->key)) for (j = 1; 1; j++)
470                                {
471                                    state_num = hnj_hash_lookup (hashtab, e->key + j);
472                                    if (state_num >= 0)
473                                        break;
474                                }
475                 /* KBH: FIXME state 0 fallback_state should always be -1? */
476                 if (e->val)
477                     dict[k]->states[e->val].fallback_state = state_num;
478             }
479 #ifdef VERBOSE
480         for (i = 0; i < HASH_SIZE; i++)
481             for (e = hashtab->entries[i]; e; e = e->next)
482             {
483                 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
484                     dict[k]->states[e->val].fallback_state);
485                 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
486                     printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
487                         dict[k]->states[e->val].trans[j].new_state);
488             }
489 #endif
490 
491 #ifndef VERBOSE
492         hnj_hash_free (hashtab);
493 #endif
494         state_num = 0;
495     }
496     if (k == 2) dict[0]->nextlevel = dict[1];
497     return dict[0];
498 }
499 
hnj_hyphen_free(HyphenDict * dict)500 void hnj_hyphen_free (HyphenDict *dict)
501 {
502     int state_num;
503     HyphenState *hstate;
504 
505     for (state_num = 0; state_num < dict->num_states; state_num++)
506     {
507         hstate = &dict->states[state_num];
508         if (hstate->match)
509             hnj_free (hstate->match);
510         if (hstate->repl)
511             hnj_free (hstate->repl);
512         if (hstate->trans)
513             hnj_free (hstate->trans);
514     }
515     if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
516 
517     hnj_free (dict->states);
518 
519     hnj_free (dict);
520 }
521 
522 #define MAX_WORD 256
523 
hnj_hyphen_hyphenate(HyphenDict * dict,const char * word,int word_size,char * hyphens)524 int hnj_hyphen_hyphenate (HyphenDict *dict,
525     const char *word, int word_size,
526     char *hyphens)
527 {
528     char prep_word_buf[MAX_WORD];
529     char *prep_word;
530     int i, j, k;
531     int state;
532     char ch;
533     HyphenState *hstate;
534     char *match;
535     int offset;
536 
537     if (word_size + 3 < MAX_WORD)
538         prep_word = prep_word_buf;
539     else
540         prep_word = hnj_malloc (word_size + 3);
541 
542     j = 0;
543     prep_word[j++] = '.';
544 
545     for (i = 0; i < word_size; i++)
546         prep_word[j++] = word[i];
547 
548     prep_word[j++] = '.';
549     prep_word[j] = '\0';
550 
551     for (i = 0; i < j; i++)
552         hyphens[i] = '0';
553 
554 #ifdef VERBOSE
555     printf ("prep_word = %s\n", prep_word);
556 #endif
557 
558     /* now, run the finite state machine */
559     state = 0;
560     for (i = 0; i < j; i++)
561     {
562         ch = prep_word[i];
563         for (;;)
564         {
565 
566             if (state == -1) {
567                 /* return 1; */
568                 /*  KBH: FIXME shouldn't this be as follows? */
569                 state = 0;
570                 goto try_next_letter;
571             }
572 
573 #ifdef VERBOSE
574             char *state_str;
575             state_str = get_state_str (state);
576 
577             for (k = 0; k < i - strlen (state_str); k++)
578                 putchar (' ');
579             printf ("%s", state_str);
580 #endif
581 
582             hstate = &dict->states[state];
583             for (k = 0; k < hstate->num_trans; k++)
584                 if (hstate->trans[k].ch == ch)
585                 {
586                     state = hstate->trans[k].new_state;
587                     goto found_state;
588                 }
589             state = hstate->fallback_state;
590 #ifdef VERBOSE
591             printf (" falling back, fallback_state %d\n", state);
592 #endif
593         }
594       found_state:
595 #ifdef VERBOSE
596         printf ("found state %d\n",state);
597 #endif
598         /* Additional optimization is possible here - especially,
599            elimination of trailing zeroes from the match. Leading zeroes
600            have already been optimized. */
601         match = dict->states[state].match;
602         /* replacing rules not handled by hyphen_hyphenate() */
603         if (match && !dict->states[state].repl)
604         {
605             offset = i + 1 - strlen (match);
606 #ifdef VERBOSE
607             for (k = 0; k < offset; k++)
608                 putchar (' ');
609             printf ("%s\n", match);
610 #endif
611             /* This is a linear search because I tried a binary search and
612                found it to be just a teeny bit slower. */
613             for (k = 0; match[k]; k++)
614                 if (hyphens[offset + k] < match[k])
615                     hyphens[offset + k] = match[k];
616         }
617 
618         /* KBH: we need this to make sure we keep looking in a word */
619         /* for patterns even if the current character is not known in state 0 */
620         /* since patterns for hyphenation may occur anywhere in the word */
621       try_next_letter: ;
622 
623     }
624 #ifdef VERBOSE
625     for (i = 0; i < j; i++)
626         putchar (hyphens[i]);
627     putchar ('\n');
628 #endif
629 
630     for (i = 0; i < j - 4; i++)
631 #if 0
632         if (hyphens[i + 1] & 1)
633             hyphens[i] = '-';
634 #else
635     hyphens[i] = hyphens[i + 1];
636 #endif
637     hyphens[0] = '0';
638     for (; i < word_size; i++)
639         hyphens[i] = '0';
640     hyphens[word_size] = '\0';
641 
642     if (prep_word != prep_word_buf)
643         hnj_free (prep_word);
644 
645     return 0;
646 }
647 
648 /* character length of the first n byte of the input word */
hnj_hyphen_strnlen(const char * word,int n,int utf8)649 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
650 {
651     int i = 0;
652     int j = 0;
653     while (j < n && word[j] != '\0') {
654         i++;
655         for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
656     }
657     return i;
658 }
659 
hnj_hyphen_lhmin(int utf8,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int lhmin)660 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
661 	char *** rep, int ** pos, int ** cut, int lhmin)
662 {
663     int i, j;
664     for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
665             // check length of the non-standard part
666             if (*rep && *pos && *cut && (*rep)[j]) {
667                 char * rh = strchr((*rep)[j], '=');
668                 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
669                         hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
670                     free((*rep)[j]);
671                     (*rep)[j] = NULL;
672                     hyphens[j] = '0';
673                 }
674             } else {
675                 hyphens[j] = '0';
676             }
677             j++;
678         } while (utf8 && (word[j + 1] & 0xc0) == 0xc0);
679     return 0;
680 }
681 
hnj_hyphen_rhmin(int utf8,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int rhmin)682 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
683 	char *** rep, int ** pos, int ** cut, int rhmin)
684 {
685     int i;
686     int j = word_size - 2;
687     for (i = 1; i < rhmin && j > 0; j--) {
688         // check length of the non-standard part
689         if (*rep && *pos && *cut && (*rep)[j]) {
690             char * rh = strchr((*rep)[j], '=');
691             if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
692                     hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
693                 free((*rep)[j]);
694                 (*rep)[j] = NULL;
695                 hyphens[j] = '0';
696             }
697         } else {
698             hyphens[j] = '0';
699         }
700         if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
701     }
702     return 0;
703 }
704 
705 // recursive function for compound level hyphenation
hnj_hyphen_hyph_(HyphenDict * dict,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int clhmin,int crhmin,int lend,int rend)706 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
707     char * hyphens, char *** rep, int ** pos, int ** cut,
708     int clhmin, int crhmin, int lend, int rend)
709 {
710     char prep_word_buf[MAX_WORD];
711     char *prep_word;
712     int i, j, k;
713     int state;
714     char ch;
715     HyphenState *hstate;
716     char *match;
717     char *repl;
718     signed char replindex;
719     signed char replcut;
720     int offset;
721     int matchlen_buf[MAX_CHARS];
722     int matchindex_buf[MAX_CHARS];
723     char * matchrepl_buf[MAX_CHARS];
724     int * matchlen;
725     int * matchindex;
726     char ** matchrepl;
727     int isrepl = 0;
728     int nHyphCount;
729 
730     if (word_size + 3 < MAX_CHARS) {
731         prep_word = prep_word_buf;
732         matchlen = matchlen_buf;
733         matchindex = matchindex_buf;
734         matchrepl = matchrepl_buf;
735     } else {
736         prep_word = hnj_malloc (word_size + 3);
737         matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
738         matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
739         matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
740     }
741 
742     j = 0;
743     prep_word[j++] = '.';
744 
745     for (i = 0; i < word_size; i++)
746         prep_word[j++] = word[i];
747 
748     prep_word[j++] = '.';
749     prep_word[j] = '\0';
750 
751     for (i = 0; i < j; i++)
752         hyphens[i] = '0';
753 
754 #ifdef VERBOSE
755     printf ("prep_word = %s\n", prep_word);
756 #endif
757 
758     /* now, run the finite state machine */
759     state = 0;
760     for (i = 0; i < j; i++)
761     {
762         ch = prep_word[i];
763         for (;;)
764         {
765 
766             if (state == -1) {
767                 /* return 1; */
768                 /*  KBH: FIXME shouldn't this be as follows? */
769                 state = 0;
770                 goto try_next_letter;
771             }
772 
773 #ifdef VERBOSE
774             char *state_str;
775             state_str = get_state_str (state);
776 
777             for (k = 0; k < i - strlen (state_str); k++)
778                 putchar (' ');
779             printf ("%s", state_str);
780 #endif
781 
782             hstate = &dict->states[state];
783             for (k = 0; k < hstate->num_trans; k++)
784                 if (hstate->trans[k].ch == ch)
785                 {
786                     state = hstate->trans[k].new_state;
787                     goto found_state;
788                 }
789             state = hstate->fallback_state;
790 #ifdef VERBOSE
791             printf (" falling back, fallback_state %d\n", state);
792 #endif
793         }
794       found_state:
795 #ifdef VERBOSE
796         printf ("found state %d\n",state);
797 #endif
798         /* Additional optimization is possible here - especially,
799            elimination of trailing zeroes from the match. Leading zeroes
800            have already been optimized. */
801         match = dict->states[state].match;
802         repl = dict->states[state].repl;
803         replindex = dict->states[state].replindex;
804         replcut = dict->states[state].replcut;
805         /* replacing rules not handled by hyphen_hyphenate() */
806         if (match)
807         {
808             offset = i + 1 - strlen (match);
809 #ifdef VERBOSE
810             for (k = 0; k < offset; k++)
811                 putchar (' ');
812             printf ("%s (%s)\n", match, repl);
813 #endif
814             if (repl) {
815                 if (!isrepl) for(; isrepl < word_size; isrepl++) {
816                         matchrepl[isrepl] = NULL;
817                         matchindex[isrepl] = -1;
818                     }
819                 matchlen[offset + replindex] = replcut;
820             }
821             /* This is a linear search because I tried a binary search and
822                found it to be just a teeny bit slower. */
823             for (k = 0; match[k]; k++) {
824                 if ((hyphens[offset + k] < match[k])) {
825                     hyphens[offset + k] = match[k];
826                     if (match[k]&1) {
827                         matchrepl[offset + k] = repl;
828                         if (repl && (k >= replindex) && (k <= replindex + replcut)) {
829                             matchindex[offset + replindex] = offset + k;
830                         }
831                     }
832                 }
833             }
834 
835         }
836 
837         /* KBH: we need this to make sure we keep looking in a word */
838         /* for patterns even if the current character is not known in state 0 */
839         /* since patterns for hyphenation may occur anywhere in the word */
840       try_next_letter: ;
841 
842     }
843 #ifdef VERBOSE
844     for (i = 0; i < j; i++)
845         putchar (hyphens[i]);
846     putchar ('\n');
847 #endif
848 
849     for (i = 0; i < j - 3; i++)
850 #if 0
851         if (hyphens[i + 1] & 1)
852             hyphens[i] = '-';
853 #else
854     hyphens[i] = hyphens[i + 1];
855 #endif
856     for (; i < word_size; i++)
857         hyphens[i] = '0';
858     hyphens[word_size] = '\0';
859 
860     /* now create a new char string showing hyphenation positions */
861     /* count the hyphens and allocate space for the new hyphenated string */
862     nHyphCount = 0;
863     for (i = 0; i < word_size; i++)
864         if (hyphens[i]&1)
865             nHyphCount++;
866     j = 0;
867     for (i = 0; i < word_size; i++) {
868         if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
869             if (rep && pos && cut) {
870                 if (!*rep && !*pos && !*cut) {
871                     int k;
872                     *rep = (char **) malloc(sizeof(char *) * word_size);
873                     *pos = (int *) malloc(sizeof(int) * word_size);
874                     *cut = (int *) malloc(sizeof(int) * word_size);
875                     for (k = 0; k < word_size; k++) {
876                         (*rep)[k] = NULL;
877                         (*pos)[k] = 0;
878                         (*cut)[k] = 0;
879                     }
880                 }
881                 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
882                 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
883                 (*cut)[matchindex[i] - 1] = matchlen[i];
884             }
885             j += strlen(matchrepl[matchindex[i]]);
886             i += matchlen[i] - 1;
887         }
888     }
889 
890     if (matchrepl != matchrepl_buf) {
891         hnj_free (matchrepl);
892         hnj_free (matchlen);
893         hnj_free (matchindex);
894     }
895 
896     // recursive hyphenation of the first (compound) level segments
897     if (dict->nextlevel) {
898         char * rep2_buf[MAX_WORD];
899         int pos2_buf[MAX_WORD];
900         int cut2_buf[MAX_WORD];
901         char hyphens2_buf[MAX_WORD];
902         char ** rep2;
903         int * pos2;
904         int * cut2;
905         char * hyphens2;
906         int begin = 0;
907         if (word_size < MAX_CHARS) {
908             rep2 = rep2_buf;
909             pos2 = pos2_buf;
910             cut2 = cut2_buf;
911             hyphens2 = hyphens2_buf;
912         } else {
913             rep2 = hnj_malloc (word_size * sizeof(char *));
914             pos2 = hnj_malloc (word_size * sizeof(int));
915             cut2 = hnj_malloc (word_size * sizeof(int));
916             hyphens2 = hnj_malloc (word_size);
917         }
918         for (i = 0; i < word_size; i++) rep2[i] = NULL;
919         for (i = 0; i < word_size; i++)
920             if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
921                 if (i - begin > 1) {
922                     int hyph = 0;
923                     prep_word[i + 2] = '\0';
924                     /* non-standard hyphenation at compound boundary (Schiffahrt) */
925                     if (*rep && *pos && *cut && (*rep)[i]) {
926                         char * l = strchr((*rep)[i], '=');
927                         strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
928                         if (l) {
929                             hyph = (l - (*rep)[i]) - (*pos)[i];
930                             prep_word[2 + i + hyph] = '\0';
931                         }
932                     }
933                     hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
934                         hyphens2, &rep2, &pos2, &cut2, clhmin,
935                         crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
936                     for (j = 0; j < i - begin - 1; j++) {
937                         hyphens[begin + j] = hyphens2[j];
938                         if (rep2[j] && rep && pos && cut) {
939                             if (!*rep && !*pos && !*cut) {
940                                 int k;
941                                 *rep = (char **) malloc(sizeof(char *) * word_size);
942                                 *pos = (int *) malloc(sizeof(int) * word_size);
943                                 *cut = (int *) malloc(sizeof(int) * word_size);
944                                 for (k = 0; k < word_size; k++) {
945                                     (*rep)[k] = NULL;
946                                     (*pos)[k] = 0;
947                                     (*cut)[k] = 0;
948                                 }
949                             }
950                             (*rep)[begin + j] = rep2[j];
951                             (*pos)[begin + j] = pos2[j];
952                             (*cut)[begin + j] = cut2[j];
953                         }
954                     }
955                     prep_word[i + 2] = word[i + 1];
956                     if (*rep && *pos && *cut && (*rep)[i]) {
957                         strcpy(prep_word + 1, word);
958                     }
959                 }
960                 begin = i + 1;
961                 for (j = 0; j < word_size; j++) rep2[j] = NULL;
962             }
963 
964         // non-compound
965         if (begin == 0) {
966             hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
967                 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
968             if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
969                 rep, pos, cut, clhmin);
970             if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
971                 rep, pos, cut, crhmin);
972         }
973 
974         if (rep2 != rep2_buf) {
975             free(rep2);
976             free(cut2);
977             free(pos2);
978             free(hyphens2);
979         }
980     }
981 
982     if (prep_word != prep_word_buf) hnj_free (prep_word);
983     return 0;
984 }
985 
986 /* UTF-8 normalization of hyphen and non-standard positions */
hnj_hyphen_norm(const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut)987 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
988 	char *** rep, int ** pos, int ** cut)
989 {
990     if ((((unsigned char) word[0]) >> 6) == 2) {
991         fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
992         return 1;
993     }
994 
995     /* calculate UTF-8 character positions */
996     int i, j, k;
997     for (i = 0, j = -1; i < word_size; i++) {
998         /* beginning of an UTF-8 character (not '10' start bits) */
999         if ((((unsigned char) word[i]) >> 6) != 2) j++;
1000         hyphens[j] = hyphens[i];
1001         if (rep && pos && cut && *rep && *pos && *cut) {
1002             int l = (*pos)[i];
1003             (*pos)[j] = 0;
1004             for (k = 0; k < l; k++) {
1005                 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1006             }
1007             k = i - l + 1;
1008             l = k + (*cut)[i];
1009             (*cut)[j] = 0;
1010             for (; k < l; k++) {
1011                 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1012             }
1013             (*rep)[j] = (*rep)[i];
1014             if (j < i) {
1015                 (*rep)[i] = NULL;
1016                 (*pos)[i] = 0;
1017                 (*cut)[i] = 0;
1018             }
1019         }
1020     }
1021     hyphens[j + 1] = '\0';
1022     return 0;
1023 }
1024 
1025 /* get the word with all possible hyphenations (output: hyphword) */
hnj_hyphen_hyphword(const char * word,int l,const char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut)1026 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1027     char * hyphword, char *** rep, int ** pos, int ** cut)
1028 {
1029     int i, j;
1030     for (i = 0, j = 0; i < l; i++, j++) {
1031         if (hyphens[i]&1) {
1032             hyphword[j] = word[i];
1033             if (*rep && *pos && *cut && (*rep)[i]) {
1034                 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
1035                 j += strlen((*rep)[i]) - (*pos)[i];
1036                 i += (*cut)[i] - (*pos)[i];
1037             } else hyphword[++j] = '=';
1038         } else hyphword[j] = word[i];
1039     }
1040     hyphword[j] = '\0';
1041 }
1042 
1043 
1044 /* main api function with default hyphenmin parameters */
hnj_hyphen_hyphenate2(HyphenDict * dict,const char * word,int word_size,char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut)1045 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1046     const char *word, int word_size, char * hyphens,
1047     char *hyphword, char *** rep, int ** pos, int ** cut)
1048 {
1049     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1050         dict->clhmin, dict->crhmin, 1, 1);
1051     hnj_hyphen_lhmin(dict->utf8, word, word_size,
1052         hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1053     hnj_hyphen_rhmin(dict->utf8, word, word_size,
1054         hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1055     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1056     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1057     return 0;
1058 }
1059 
1060 /* previous main api function with hyphenmin parameters */
hnj_hyphen_hyphenate3(HyphenDict * dict,const char * word,int word_size,char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut,int lhmin,int rhmin,int clhmin,int crhmin)1061 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1062 	const char *word, int word_size, char * hyphens,
1063 	char *hyphword, char *** rep, int ** pos, int ** cut,
1064 	int lhmin, int rhmin, int clhmin, int crhmin)
1065 {
1066     lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
1067     rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
1068     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1069         clhmin, crhmin, 1, 1);
1070     hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1071         rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1072     hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1073         rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1074     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1075     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1076     return 0;
1077 }
1078