• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 John Maddock
2 //
3 // Use, modification and distribution are subject to the
4 // Boost Software License, Version 1.0.
5 // (See accompanying file LICENSE_1_0.txt
6 // or copy at http://www.boost.org/LICENSE_1_0.txt)
7 
8 #include <set>
9 #include <cstring>
10 #include <boost/array.hpp>
11 #include <boost/exception/all.hpp>
12 #include <boost/program_options.hpp>
13 #include "auto_index.hpp"
14 
15 std::string infile, outfile, prefix, last_primary, last_secondary, last_tertiary;
16 std::set<index_info> index_terms;
17 std::set<std::pair<std::string, std::string> > found_terms;
18 bool no_duplicates = false;
19 bool verbose = false;
20 bool use_section_names = true;
21 index_entry_set index_entries;
22 boost::tiny_xml::element_list indexes;
23 std::list<id_rewrite_rule> id_rewrite_list;
24 bool internal_indexes = false;
25 std::string internal_index_type = "section";
26 boost::regex debug;
27 file_scanner_set_type file_scanner_set;
28 
//
// Point the user at the documentation on standard output; always returns 1:
//
int help()
{
   static const char* const message = "Please refer to the documentation for the correct command line syntax";
   std::cout << message << std::endl;
   return 1;
}
34 
//
// Append to "result" everything read from "is" up to and including the
// next closing '>'.  A backslash escapes the character that follows it,
// so an escaped '>' is copied but does not end the block.
//
// If the stream runs out before a closing '>' is seen, only the characters
// that were actually read are appended (nothing is invented for the
// missing delimiter), and the stream is left in its failed/eof state.
//
void eat_block(std::string& result, std::istream & is)
{
   char c = 0;
   while(is.get(c))
   {
      result += c;
      if(c == '>')
         return;  // closing delimiter has been consumed and stored
      // Copy an escaped character verbatim, but only if one was really read:
      if((c == '\\') && is.get(c))
         result += c;
   }
}
52 
//
// Collect any leading <? ... > and <! ... > elements from "is" and return
// them, one per line.  On return the opening '<' of the first element that
// is not one of these has already been consumed from the stream.
// Throws std::runtime_error if something other than '<' starts an element.
//
std::string get_header(std::istream & is)
{
   std::string header;
   for(;;)
   {
      // Each element must open with '<' once leading whitespace is skipped:
      is >> std::ws;
      if(is.get() != '<')
         throw std::runtime_error("Invalid leading markup in XML file found");
      const char next = is.peek();
      if((next != '?') && (next != '!'))
         break;
      // Copy the element up to its closing '>', restoring both delimiters:
      std::string body;
      std::getline(is, body, '>');
      header += '<';
      header += body;
      header += '>';
      header += '\n';
   }
   return header;
}
76 //
77 // Find attribute named "name" in node "node":
78 //
find_attr(boost::tiny_xml::element_ptr node,const char * name)79 const std::string* find_attr(boost::tiny_xml::element_ptr node, const char* name)
80 {
81    for(boost::tiny_xml::attribute_list::const_iterator i = node->attributes.begin();
82       i != node->attributes.end(); ++i)
83    {
84       if(i->name == name)
85          return &(i->value);
86    }
87    return 0;
88 }
89 //
90 // Get the ID of the current block scope, basically
91 // move up the XML tree until we find a valid ID:
92 //
get_current_block_id(node_id const * id)93 const std::string* get_current_block_id(node_id const* id)
94 {
95    while((id->id == 0) && (id->prev))
96       id = id->prev;
97    if(!id->id)
98       BOOST_THROW_EXCEPTION(std::runtime_error("Current XML block has no enclosing ID: XML is not valid Boostbook?"));
99    return id->id;
100 }
101 //
102 // Get the title of the current block scope, basically
103 // move up the XML tree until we find a valid title:
104 //
get_current_block_title(title_info const * id)105 const std::string& get_current_block_title(title_info const* id)
106 {
107    while((id->title.size() == 0) && (id->prev))
108       id = id->prev;
109    return id->title;
110 }
111 //
112 // Get all the content under this node, with any inline XML
113 // stripped out:
114 //
get_consolidated_content(boost::tiny_xml::element_ptr node)115 std::string get_consolidated_content(boost::tiny_xml::element_ptr node)
116 {
117    std::string result(node->content);
118    for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
119       i != node->elements.end(); ++i)
120    {
121       result += " ";
122       result += get_consolidated_content(*i);
123    }
124    static const boost::regex e("(^[[:space:]]+)|([[:space:]]+)|([[:space:]]+$)");
125    return regex_replace(result, e, "(?2 )", boost::regex_constants::format_all);
126 }
127 //
128 // Rewrite a title based on any rewrite rules we may have:
129 //
rewrite_title(const std::string & title,const std::string & id)130 std::string rewrite_title(const std::string& title, const std::string& id)
131 {
132    for(std::list<id_rewrite_rule>::const_iterator i = id_rewrite_list.begin(); i != id_rewrite_list.end(); ++i)
133    {
134       if(i->base_on_id)
135       {
136          if(regex_match(id, i->id))
137             return i->new_name;
138       }
139       else
140       {
141          if(regex_match(title, i->id))
142             return regex_replace(title, i->id, i->new_name);
143       }
144    }
145    return title;
146 }
147 
//
// Ordering predicate for C strings: compares the characters pointed to
// (via strcmp) rather than the pointer values themselves.
//
struct string_cmp
{
   bool operator()(const char* a, const char* b)const
   {
      const int order = std::strcmp(a, b);
      return order < 0;
   }
};
155 //
156 // Discover whether this node can contain a <title> or not, if not
157 // we don't want to link to it, or the XSL HTML stylesheets may do strange
158 // things, and at least emit copious messages.  See https://sourceforge.net/tracker/?func=detail&aid=3325153&group_id=21935&atid=373747
159 //
can_contain_title(const char * name)160 bool can_contain_title(const char* name)
161 {
162    static const boost::array<const char*, 103> names =
163    { {
164       "abstract", "appendix", "appendixinfo", "article", "articleinfo", "authorblurb", "bibliodiv", "biblioentry", "bibliography",
165        "bibliographyinfo", "bibliolist", "bibliomixed", "bibliomset", "biblioset", "blockinfo", "blockquote", "book", "bookinfo",
166        "calloutlist", "caution", "chapter", "chapterinfo", "colophon", "constraintdef", "dedication", "equation", "example", "figure",
167        "formalpara", "glossary", "glossaryinfo", "glossdiv", "glosslist", "important", "index", "indexdiv", "indexinfo", "itemizedlist",
168        "legalnotice", "lot", "msg", "msgexplan", "msgmain", "msgrel", "msgset", "msgsub", "note", "objectinfo", "orderedlist", "part",
169        "partinfo", "partintro", "personblurb", "preface", "prefaceinfo", "procedure", "productionset", "qandadiv", "qandaset",
170        "refentryinfo", "reference", "referenceinfo", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
171        "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
172        "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "segmentedlist", "set", "setindex",
173        "setindexinfo", "setinfo", "sidebar", "sidebarinfo", "simplesect", "step", "table", "task", "taskprerequisites",
174        "taskrelated", "tasksummary", "tip", "toc", "variablelist", "warning", "refentry"
175    } };
176    static std::set<const char*, string_cmp> permitted;
177 
178    if(permitted.empty())
179       permitted.insert(names.begin(), names.end());
180 
181    return 0 != permitted.count(name);
182 }
183 //
184 // Determine whether this node can contain an indexterm or not:
185 //
can_contain_indexterm(const char * name)186 bool can_contain_indexterm(const char* name)
187 {
188    static const boost::array<const char*, 257> names =
189    { {
190       "abbrev", "accel", "ackno", "acronym", "action", "answer", "appendix", "appendixinfo", "application",
191       "article", "articleinfo", "artpagenums", "attribution", "authorinitials", "bibliocoverage", "bibliodiv",
192       "biblioentry", "bibliography", "bibliographyinfo", "biblioid", "bibliomisc", "bibliomixed", "bibliomset",
193       "bibliorelation", "biblioset", "bibliosource", "blockinfo", "blockquote", "bookinfo", "bridgehead", "callout",
194       "caution", "chapter", "chapterinfo", "citation", "citebiblioid", "citetitle", "city", "classname", "classsynopsisinfo",
195       "code", "collabname", "command", "computeroutput", "confdates", "confnum", "confsponsor", "conftitle", "constant",
196       "constraintdef", "contractnum", "contractsponsor", "contrib", "corpauthor", "corpcredit", "corpname", "country",
197       "database", "date", "dedication", "edition", "email", "emphasis", "entry", "envar", "errorcode", "errorname", "errortext",
198       "errortype", "example", "exceptionname", "fax", "figure", "filename", "firstname", "firstterm", "foreignphrase",
199       "formalpara", "funcparams", "funcsynopsisinfo", "function", "glossary", "glossaryinfo", "glossdef", "glossdiv",
200       "glossentry", "glosssee", "glossseealso", "glossterm", "guibutton", "guiicon", "guilabel", "guimenu", "guimenuitem",
201       "guisubmenu", "hardware", "highlights", "holder", "honorific", "important", "index", "indexinfo", "informalexample",
202       "informalfigure", "initializer", "interface", "interfacename", "invpartnumber", "isbn", "issn", "issuenum", "itemizedlist",
203       "itermset", "jobtitle", "keycap", "keycode", "keysym", "label", "legalnotice", "lineage", "lineannotation",
204       /*"link", */"listitem", "literal", "literallayout", "lotentry", "manvolnum", "markup", "medialabel", "member",
205       "methodname", "modespec", "modifier", "mousebutton", "msgaud", "msgexplan", "msglevel", "msgorig", "msgtext", "note",
206       "objectinfo", "olink", "option", "optional", "orderedlist", "orgdiv", "orgname", "otheraddr", "othername", "package",
207       "pagenums", "para", "parameter", "partinfo", "partintro", "phone", "phrase", "pob", "postcode", "preface", "prefaceinfo",
208       "procedure", "productname", "productnumber", "programlisting", "prompt", "property", "pubdate", "publishername",
209       "pubsnumber", "qandadiv", "qandaset", "question", "quote", "refentry", "refentryinfo", "refentrytitle", "referenceinfo",
210       "refmeta", "refmiscinfo", "refpurpose", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
211       "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "releaseinfo", "remark", "returnvalue",
212       "revdescription", "revnumber", "revremark", "screen", "screeninfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
213       "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "seg", "segtitle", "seriesvolnums",
214       "setindex", "setindexinfo", "setinfo", "sgmltag", "shortaffil", "sidebar", "sidebarinfo", "simpara", "simplesect",
215       "state", "step", "street", "structfield", "structname", "subtitle", "surname", "symbol", "synopsis", "systemitem",
216       "table", "task", "taskprerequisites", "taskrelated", "tasksummary", "td", "term", "termdef", "th", "tip", /*"title",*/
217       "titleabbrev", "tocback", "tocentry", "tocfront", "token", "type", "ulink", "uri", "userinput", "variablelist",
218       "varname", "volumenum", "warning", "wordasword", "year"
219    } };
220    static std::set<const char*, string_cmp> permitted;
221 
222    if(permitted.empty())
223       permitted.insert(names.begin(), names.end());
224 
225    return 0 != permitted.count(name);
226 }
227 //
228 // Decide whether to flatten this node for searching purposes:
229 //
should_flatten_node(const char * name)230 bool should_flatten_node(const char* name)
231 {
232    //
233    // The list of nodes to flatten is basically the list of elements that
234    // can appear inside a <section> - see http://www.docbook.org/tdg/en/html/section.html.
235    // In other words basically anything at the level of a paragraph/table/listing etc.
236    //
237    static const boost::array<const char*, 57> names =
238    { {
239       "title", "subtitle", "titleabbrev",
240       "toc", "lot", "glossary", "bibliography",
241       /*"calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
242       "segmentedlist", "simplelist", "variablelist",*/ "caution", "important", "note",
243       "tip", "warning", "literallayout", "programlisting", "programlistingco",
244       "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
245       "classsynopsis", "fieldsynopsis", "constructorsynopsis",
246       "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
247       "address", "blockquote", "graphic", "graphicco", "mediaobject",
248       "mediaobjectco", "informalequation", "informalexample", "informalfigure",
249       "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
250       "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
251       "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
252       /*"biblioentry", "bibliomixed", "callout", "glossentry", "listitem", "seg", "seglistitem", "member",
253       "term", */
254    } };
255    static std::set<const char*, string_cmp> terminals;
256 
257    if(terminals.empty())
258          terminals.insert(names.begin(), names.end());
259    return 0 != terminals.count(name);
260 }
unescape_xml(const std::string & s)261 std::string unescape_xml(const std::string& s)
262 {
263    boost::regex e("&(?:(quot)|(amp)|(apos)|(lt)|(gt));");
264    return regex_replace(s, e, "(?1\")(?2&)(?3\')(?4<)(?5>)", boost::regex_constants::format_all);
265 }
//
// Empty tag types thrown by process_node to carry processing instruction
// info up the call stack.
//
// Thrown for <?BoostAutoIndex IgnoreSection?>:
struct ignore_section
{
};
// Thrown for <?BoostAutoIndex IgnoreBlock?>:
struct ignore_block
{
};
//
// Returns true if "name" is the name of a section-level element (section,
// chapter, etc):
//
bool is_section(const std::string& name)
{
   // The array size is deduced, so the list and its length cannot get out of step:
   static const char* const data[] =
   {
      "dedication", "toc", "lot", "glossary", "bibliography", "preface", "chapter",
      "reference", "part", "article", "appendix", "index", "setindex", "colophon",
      "sect1", "refentry", "simplesect", "section", "partintro"
   };
   // Built once on first use, rather than being rebuilt on every call:
   static const std::set<std::string> names(data, data + sizeof(data) / sizeof(data[0]));
   return 0 != names.count(name);
}
//
// Returns true if "name" is the name of a block/paragraph-level element:
//
bool is_block(const std::string& name)
{
   // The array size is deduced, so the list and its length cannot get out of step:
   static const char* const data[] =
   {
      "calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
      "segmentedlist", "simplelist", "variablelist", "caution", "important", "note",
      "tip", "warning", "literallayout", "programlisting", "programlistingco",
      "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
      "classsynopsis", "fieldsynopsis", "constructorsynopsis",
      "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
      "address", "blockquote", "graphic", "graphicco", "mediaobject",
      "mediaobjectco", "informalequation", "informalexample", "informalfigure",
      "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
      "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
      "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
   };
   // Built once on first use, rather than being rebuilt on every call:
   static const std::set<std::string> names(data, data + sizeof(data) / sizeof(data[0]));
   return 0 != names.count(name);
}
311 //
312 // Helper proc to recurse through children:
313 //
314 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen);
recurse_through_children(boost::tiny_xml::element_ptr node,node_id * id,title_info * pt,bool seen)315 bool recurse_through_children(boost::tiny_xml::element_ptr node, node_id* id, title_info* pt, bool seen)
316 {
317    try
318    {
319       for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
320          i != node->elements.end(); ++i)
321       {
322          process_node(*i, id, pt, seen);
323       }
324    }
325    catch(const ignore_section&)
326    {
327       if(is_section(node->name))
328          return false;
329       else
330          throw;
331    }
332    catch(const ignore_block&)
333    {
334       if(is_block(node->name) || is_section(node->name))
335          return false;
336       else
337          throw;
338    }
339    return true;
340 }
341 //
342 // This does most of the work: process the node pointed to, and any children
343 // that it may have:
344 //
process_node(boost::tiny_xml::element_ptr node,node_id * prev,title_info * pt,bool seen=false)345 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen = false)
346 {
347    //
348    // Store the current ID and title as nested scoped objects:
349    //
350    node_id id = { 0, prev };
351    if(can_contain_title(node->name.c_str()))
352    {
353       // Only set the ID to link to if the block can contain a title, see
354       // can_contain_title above for rationale.
355       id.id = find_attr(node, "id");
356    }
357    title_info title = { "", pt};
358    bool flatten = should_flatten_node(node->name.c_str());
359 
360    if(node->name.size() && node->name[0] == '?')
361    {
362       if(node->name == "?BoostAutoIndex")
363       {
364          if(node->content == "IgnoreSection")
365          {
366             throw ignore_section();
367          }
368          else if(node->content == "IgnoreBlock")
369          {
370             throw ignore_block();
371          }
372       }
373       return; // Ignore processing instructions
374    }
375    else if((node->name == "title") && (id.prev->id))
376    {
377       //
378       // This actually sets the title of the enclosing scope,
379       // not this tag itself:
380       //
381       title.prev->title = get_consolidated_content(node);
382       if(verbose)
383          std::cout << "Indexing section: " << title.prev->title << std::endl;
384    }
385    else if((node->name == "refentrytitle") && (id.prev->prev->id))
386    {
387       //
388       // This actually sets the title of the enclosing refentry scope,
389       // not this tag itself:
390       //
391       title.prev->prev->title = get_consolidated_content(node);
392       if(verbose)
393          std::cout << "Indexing refentry: " << title.prev->prev->title << std::endl;
394    }
395    if(node->name == "anchor")
396    {
397       if(node->parent.lock()->name == "title")
398       {
399          // We have a title with a nested anchor ID, change the ID of our parents parent to match:
400          id.prev->prev->id = id.id;
401       }
402    }
403    else if(node->name == "index")
404    {
405       // Keep track of all the indexes we see:
406       indexes.push_back(node);
407       if(node->parent.lock()->name == "para")
408          node->parent.lock()->name = "";
409    }
410    else if(node->name == "primary")
411    {
412       last_primary = get_consolidated_content(node);
413    }
414    else if(node->name == "secondary")
415    {
416       last_secondary = get_consolidated_content(node);
417    }
418    else if(node->name == "tertiary")
419    {
420       last_tertiary = get_consolidated_content(node);
421    }
422    else if((node->name == "see") && internal_indexes)
423    {
424       std::cerr << "WARNING: <see> in XML source will be ignored for the index generation" << std::endl;
425    }
426    else if((node->name == "seealso") && internal_indexes)
427    {
428       std::cerr << "WARNING: <seealso> in XML source will be ignored for the index generation" << std::endl;
429    }
430 
431    std::string flattenned_text;
432    const std::string* ptext;
433    if(flatten)
434    {
435       flattenned_text = unescape_xml(get_consolidated_content(node));
436       ptext = &flattenned_text;
437       //
438       // Recurse through children here if we're going to flatten the text, that way we see any processing instructions first:
439       //
440       if(!recurse_through_children(node, &id, &title, flatten || seen))
441          return;
442    }
443    else
444    {
445       ptext = &(node->content);
446    }
447 
448    //
449    // Search content for items: we only search if the content is not empty,
450    // and the content is not whitespace alone, and we haven't already searched this
451    // text in one of our parent nodes that got flattened.
452    //
453    static const boost::regex space_re("[[:space:]]+");
454    if(!seen && ptext->size() && !regex_match(*ptext, space_re))
455    {
456       // Save block ID and title in case we find some hits:
457       const std::string* pid = get_current_block_id(&id);
458       const std::string& rtitle = get_current_block_title(&title);
459       const std::string simple_title = rewrite_title(rtitle, *pid);
460       // Scan for each index term:
461       for(std::set<index_info>::const_iterator i = index_terms.begin();
462             i != index_terms.end(); ++i)
463       {
464          if(regex_search(*ptext, i->search_text))
465          {
466             //
467             // We need to check to see if this term has already been indexed
468             // in this zone, in order to prevent duplicate entries, also check
469             // that any constrait placed on the term's ID is satisfied:
470             //
471             std::pair<std::string, std::string> item_index(*pid, i->term);
472             if(((no_duplicates == false) || (0 == found_terms.count(item_index)))
473                && (i->search_id.empty() || regex_match(*pid, i->search_id)))
474             {
475                // We have something to index!
476                found_terms.insert(item_index);
477 
478                if(!debug.empty() && (regex_match(i->term, debug) || regex_match(rtitle, debug) || regex_match(simple_title, debug)))
479                {
480                   std::cout << "Debug term found, in block with ID: " << *pid << std::endl;
481                   std::cout << "Current section title is: " << rtitle << std::endl;
482                   std::cout << "The main index entry will be : " << simple_title << std::endl;
483                   std::cout << "The indexed term is: " << i->term << std::endl;
484                   std::cout << "The search regex is: " << i->search_text << std::endl;
485                   std::cout << "The section constraint is: " << i->search_id << std::endl;
486                   std::cout << "The index type for this entry is: " << i->category << std::endl;
487                }
488 
489                if(use_section_names && (simple_title != i->term))
490                {
491                   //
492                   // First off insert index entry with primary term
493                   // consisting of the section title, and secondary term the
494                   // actual index term, this gets skipped if the title and index
495                   // term are the same:
496                   //
497                   if(internal_indexes == false)
498                   {
499                      // Insert an <indexterm> into the XML:
500                      boost::tiny_xml::element_ptr p(new boost::tiny_xml::element());
501                      p->name = "indexterm";
502                      boost::tiny_xml::element_ptr prim(new boost::tiny_xml::element());
503                      prim->name = "primary";
504                      prim->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
505                      prim->elements.front()->content = simple_title;
506                      p->elements.push_front(prim);
507 
508                      boost::tiny_xml::element_ptr sec(new boost::tiny_xml::element());
509                      sec->name = "secondary";
510                      sec->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
511                      sec->elements.front()->content = i->term;
512                      p->elements.push_back(sec);
513                      try{
514                         // Insert the Indexterm:
515                         boost::tiny_xml::element_ptr parent(node->parent);
516                         while(!can_contain_indexterm(parent->name.c_str()))
517                            parent = parent->parent.lock();
518                         parent->elements.push_front(p);
519                      }
520                      catch(const std::exception&)
521                      {
522                         std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
523                      }
524                   }
525                   // Track the entry in our internal index:
526                   index_entry_ptr item1(new index_entry(simple_title));
527                   index_entry_ptr item2(new index_entry(i->term, *pid));
528                   index_entry_set::iterator pos = index_entries.insert(item1).first;
529                   (**pos).sub_keys.insert(item2);
530                }
531                //
532                // Now insert another index entry with the index term
533                // as the primary key, and the section title as the
534                // secondary key, this one gets assigned to the
535                // appropriate index category if there is one:
536                //
537                bool preferred_term = false;
538                if(internal_indexes == false)
539                {
540                   // Insert <indexterm> into the XML:
541                   boost::tiny_xml::element_ptr p2(new boost::tiny_xml::element());
542                   p2->name = "indexterm";
543                   if(i->category.size())
544                   {
545                      p2->attributes.push_back(boost::tiny_xml::attribute("type", i->category));
546                   }
547                   boost::tiny_xml::element_ptr prim2(new boost::tiny_xml::element());
548                   prim2->name = "primary";
549                   prim2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
550                   prim2->elements.front()->content = i->term;
551                   p2->elements.push_front(prim2);
552 
553                   boost::tiny_xml::element_ptr sec2(new boost::tiny_xml::element());
554                   sec2->name = "secondary";
555                   sec2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
556                   sec2->elements.front()->content = rtitle;
557                   p2->elements.push_back(sec2);
558                   try{
559                      // Insert the Indexterm:
560                      boost::tiny_xml::element_ptr parent(node->parent);
561                      while(!can_contain_indexterm(parent->name.c_str()))
562                      {
563                         // If the search text was found in a title then make it a preferred term:
564                         if(parent->name == "title")
565                            preferred_term = true;
566                         parent = parent->parent.lock();
567                      }
568                      if(preferred_term)
569                      {
570                         boost::tiny_xml::attribute a("significance", "preferred");
571                         p2->attributes.push_back(a);
572                      }
573                      parent->elements.push_front(p2);
574                   }
575                   catch(const std::exception&)
576                   {
577                      std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
578                   }
579                }
580 
581                // Track the entry in our internal index:
582                try{
583                   // figure out if it's preferred or not:
584                   boost::tiny_xml::element_ptr parent(node->parent);
585                   while(!can_contain_indexterm(parent->name.c_str()))
586                   {
587                      // If the search text was found in a title then make it a preferred term:
588                      if(parent->name == "title")
589                      {
590                         preferred_term = true;
591                      }
592                      parent = parent->parent.lock();
593                      if(!parent)
594                         break;
595                   }
596                }
597                catch(const std::exception&){}
598 
599                index_entry_ptr item3(new index_entry(i->term));
600                if(i->category.size())
601                   item3->category = i->category;
602                index_entry_ptr item4(new index_entry(rtitle, *pid));
603                item4->preferred = preferred_term;
604                index_entry_set::iterator pos = index_entries.insert(item3).first;
605                (**pos).sub_keys.insert(item4);
606             }
607          }
608       }
609    }
610    //
611    // Recurse through children, if not done already:
612    //
613    if(!flatten)
614       recurse_through_children(node, &id, &title, flatten || seen);
615    //
616    // Process manual index entries last of all:
617    //
618    if(node->name == "indexterm")
619    {
620       // Track the entry in our internal index:
621       const std::string* pid = get_current_block_id(&id);
622       const std::string* attr = find_attr(node, "type");
623       const std::string& rtitle = get_current_block_title(&title);
624       const std::string simple_title = rewrite_title(rtitle, *pid);
625       index_entry_ptr item1(new index_entry(last_primary, "", attr ? *attr : ""));
626       index_entry_set* parent = &((*index_entries.insert(item1).first)->sub_keys);
627 
628       if(last_secondary.size())
629       {
630          item1.reset(new index_entry(last_secondary, "", attr ? *attr : ""));
631          parent = &((*parent->insert(item1).first)->sub_keys);
632       }
633       if(last_tertiary.size())
634       {
635          item1.reset(new index_entry(last_tertiary, "", attr ? *attr : ""));
636          parent = &((*parent->insert(item1).first)->sub_keys);
637       }
638       item1.reset(new index_entry(simple_title, *pid, attr ? *attr : ""));
639       parent->insert(item1);
640 
641       last_primary = "";
642       last_secondary = "";
643       last_tertiary = "";
644    }
645 }
646 
process_nodes(boost::tiny_xml::element_ptr node)647 void process_nodes(boost::tiny_xml::element_ptr node)
648 {
649    node_id id = { 0, };
650    title_info t = { "", 0 };
651    process_node(node, &id, &t);
652 }
653 
main(int argc,char * argv[])654 int main(int argc, char* argv[])
655 {
656    try{
657 
658    namespace po = boost::program_options;
659    po::options_description desc("AutoIndex Allowed Options");
660    desc.add_options()
661       ("help", "Print help message")
662       ("in", po::value<std::string>(), "Set the input XML file.")
663       ("out", po::value<std::string>(), "Set output input XML file.")
664       ("scan", po::value<std::string>(), "Scan the specified file for terms to try and index.")
665       ("script", po::value<std::string>(), "Specifies the script file to use.")
666       ("no-duplicates", "Prevents duplicate index entries within the same section.")
667       ("no-section-names", "Suppresses use of section names as index entries.")
668       ("internal-index", "Causes AutoIndex to generate the index itself, rather than relying on the XSL stylesheets.")
669       ("verbose", "Turns on verbose mode.")
670       ("prefix", po::value<std::string>(), "Sets the prefix to be prepended to all file names and paths in the script file.")
671       ("index-type", po::value<std::string>(), "Sets the XML container type to use the index.")
672    ;
673 
674    po::variables_map vm;
675    po::store(po::parse_command_line(argc, argv, desc), vm);
676    po::notify(vm);
677 
678    //
679    // Process arguments:
680    //
681    if(vm.count("help"))
682    {
683       std::cout << desc;
684       return 0;
685    }
686    if(vm.count("in"))
687    {
688       infile = vm["in"].as<std::string>();
689    }
690    else
691    {
692       std::cerr << "No input XML file specified" << std::endl;
693       return 1;
694    }
695    if(vm.count("out"))
696    {
697       outfile = vm["out"].as<std::string>();
698    }
699    else
700    {
701       std::cerr << "No output XML file specified" << std::endl;
702       return 1;
703    }
704    if(vm.count("verbose"))
705    {
706       verbose = true;
707    }
708    if(vm.count("prefix"))
709    {
710       prefix = vm["prefix"].as<std::string>();
711    }
712    if(vm.count("scan"))
713    {
714       std::string f = vm["scan"].as<std::string>();
715       if(!exists(boost::filesystem::path(f)))
716          throw std::runtime_error("Error the file requested for scanning does not exist: " + f);
717       scan_file(f);
718    }
719    if(vm.count("script"))
720    {
721       process_script(vm["script"].as<std::string>());
722    }
723    if(vm.count("no-duplicates"))
724    {
725       no_duplicates = true;
726    }
727    if(vm.count("no-section-names"))
728    {
729       use_section_names = false;
730    }
731    if(vm.count("internal-index"))
732    {
733       internal_indexes = true;
734    }
735    if(vm.count("index-type"))
736    {
737       internal_index_type = vm["index-type"].as<std::string>();
738    }
739 
740    std::ifstream is(infile.c_str());
741    if((0 == is.peek()) || !is.good())
742    {
743       std::cerr << "Unable to open XML data file " << argv[1] << std::endl;
744       return 1;
745    }
746    //
747    // We need to skip any leading <? and <! elements:
748    //
749    std::string header = get_header(is);
750    boost::tiny_xml::element_ptr xml = boost::tiny_xml::parse(is, "");
751    is.close();
752 
753    std::cout << "Indexing " << index_terms.size() << " terms..." << std::endl;
754 
755    process_nodes(xml);
756 
757    if(internal_indexes)
758       generate_indexes();
759 
760    std::ofstream os(outfile.c_str());
761    os << header << std::endl;
762    boost::tiny_xml::write(*xml, os);
763    std::cout << index_entries.size() << " Index entries were created." << std::endl;
764 
765    }
766    catch(boost::exception& e)
767    {
768       std::cerr << diagnostic_information(e);
769       return 1;
770    }
771    catch(const std::exception& e)
772    {
773       std::cerr << e.what() << std::endl;
774       return 1;
775    }
776    catch(const std::string& s)
777    {
778       std::cerr << s << std::endl;
779       return 1;
780    }
781 
782    return 0;
783 }
784