1 // Copyright 2008 John Maddock
2 //
3 // Use, modification and distribution are subject to the
4 // Boost Software License, Version 1.0.
5 // (See accompanying file LICENSE_1_0.txt
6 // or copy at http://www.boost.org/LICENSE_1_0.txt)
7
8 #include <set>
9 #include <cstring>
10 #include <boost/array.hpp>
11 #include <boost/exception/all.hpp>
12 #include <boost/program_options.hpp>
13 #include "auto_index.hpp"
14
15 std::string infile, outfile, prefix, last_primary, last_secondary, last_tertiary;
16 std::set<index_info> index_terms;
17 std::set<std::pair<std::string, std::string> > found_terms;
18 bool no_duplicates = false;
19 bool verbose = false;
20 bool use_section_names = true;
21 index_entry_set index_entries;
22 boost::tiny_xml::element_list indexes;
23 std::list<id_rewrite_rule> id_rewrite_list;
24 bool internal_indexes = false;
25 std::string internal_index_type = "section";
26 boost::regex debug;
27 file_scanner_set_type file_scanner_set;
28
//
// Point the user at the documentation for the command line syntax.
// Always returns 1.
//
int help()
{
   static const char* const message = "Please refer to the documentation for the correct command line syntax";
   std::cout << message << std::endl;
   return 1;
}
34
//
// Append everything up to and including the next unescaped '>' read
// from "is" to "result".  A backslash escapes the character that follows
// it: both characters are copied and the escaped character never ends
// the block.
//
// If the stream runs out before a closing '>' is seen then only the
// characters actually read are appended: nothing is invented for the
// missing terminator.
//
void eat_block(std::string& result, std::istream & is)
{
   char c;
   while(is.get(c))
   {
      result += c;
      if(c == '>')
         return;   // closing bracket consumed and copied, we're done.
      if(c == '\\')
      {
         // Copy the escaped character verbatim, if there is one:
         if(!is.get(c))
            return;
         result += c;
      }
   }
}
52
//
// Collect any leading <?...> and <!...> elements from the stream and
// return them, one per line.  On return the stream is positioned just
// after the '<' of the first element that is neither of those.
//
// Throws std::runtime_error if something other than '<' is found where
// an element should start.
//
std::string get_header(std::istream & is)
{
   static const char* const bad_markup = "Invalid leading markup in XML file found";

   std::string header;
   is >> std::ws;
   if(is.get() != '<')
      throw std::runtime_error(bad_markup);

   for(char next = is.peek(); (next == '?') || (next == '!'); next = is.peek())
   {
      // Everything up to the next '>' belongs to this header element:
      std::string body;
      std::getline(is, body, '>');
      header += '<' + body + '>';
      // Step over the opening '<' of whatever follows:
      is >> std::ws;
      if(is.get() != '<')
         throw std::runtime_error(bad_markup);
      header += '\n';
   }
   return header;
}
76 //
77 // Find attribute named "name" in node "node":
78 //
find_attr(boost::tiny_xml::element_ptr node,const char * name)79 const std::string* find_attr(boost::tiny_xml::element_ptr node, const char* name)
80 {
81 for(boost::tiny_xml::attribute_list::const_iterator i = node->attributes.begin();
82 i != node->attributes.end(); ++i)
83 {
84 if(i->name == name)
85 return &(i->value);
86 }
87 return 0;
88 }
89 //
90 // Get the ID of the current block scope, basically
91 // move up the XML tree until we find a valid ID:
92 //
get_current_block_id(node_id const * id)93 const std::string* get_current_block_id(node_id const* id)
94 {
95 while((id->id == 0) && (id->prev))
96 id = id->prev;
97 if(!id->id)
98 BOOST_THROW_EXCEPTION(std::runtime_error("Current XML block has no enclosing ID: XML is not valid Boostbook?"));
99 return id->id;
100 }
101 //
102 // Get the title of the current block scope, basically
103 // move up the XML tree until we find a valid title:
104 //
get_current_block_title(title_info const * id)105 const std::string& get_current_block_title(title_info const* id)
106 {
107 while((id->title.size() == 0) && (id->prev))
108 id = id->prev;
109 return id->title;
110 }
111 //
112 // Get all the content under this node, with any inline XML
113 // stripped out:
114 //
get_consolidated_content(boost::tiny_xml::element_ptr node)115 std::string get_consolidated_content(boost::tiny_xml::element_ptr node)
116 {
117 std::string result(node->content);
118 for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
119 i != node->elements.end(); ++i)
120 {
121 result += " ";
122 result += get_consolidated_content(*i);
123 }
124 static const boost::regex e("(^[[:space:]]+)|([[:space:]]+)|([[:space:]]+$)");
125 return regex_replace(result, e, "(?2 )", boost::regex_constants::format_all);
126 }
127 //
128 // Rewrite a title based on any rewrite rules we may have:
129 //
rewrite_title(const std::string & title,const std::string & id)130 std::string rewrite_title(const std::string& title, const std::string& id)
131 {
132 for(std::list<id_rewrite_rule>::const_iterator i = id_rewrite_list.begin(); i != id_rewrite_list.end(); ++i)
133 {
134 if(i->base_on_id)
135 {
136 if(regex_match(id, i->id))
137 return i->new_name;
138 }
139 else
140 {
141 if(regex_match(title, i->id))
142 return regex_replace(title, i->id, i->new_name);
143 }
144 }
145 return title;
146 }
147
//
// Strict weak ordering for C strings: compares the characters pointed to
// (via strcmp) rather than the pointer values.
//
struct string_cmp
{
   bool operator()(const char* a, const char* b)const
   {
      const int order = std::strcmp(a, b);
      return order < 0;
   }
};
155 //
156 // Discover whether this node can contain a <title> or not, if not
157 // we don't want to link to it, or the XSL HTML stylesheets may do strange
158 // things, and at least emit copious messages. See https://sourceforge.net/tracker/?func=detail&aid=3325153&group_id=21935&atid=373747
159 //
can_contain_title(const char * name)160 bool can_contain_title(const char* name)
161 {
162 static const boost::array<const char*, 103> names =
163 { {
164 "abstract", "appendix", "appendixinfo", "article", "articleinfo", "authorblurb", "bibliodiv", "biblioentry", "bibliography",
165 "bibliographyinfo", "bibliolist", "bibliomixed", "bibliomset", "biblioset", "blockinfo", "blockquote", "book", "bookinfo",
166 "calloutlist", "caution", "chapter", "chapterinfo", "colophon", "constraintdef", "dedication", "equation", "example", "figure",
167 "formalpara", "glossary", "glossaryinfo", "glossdiv", "glosslist", "important", "index", "indexdiv", "indexinfo", "itemizedlist",
168 "legalnotice", "lot", "msg", "msgexplan", "msgmain", "msgrel", "msgset", "msgsub", "note", "objectinfo", "orderedlist", "part",
169 "partinfo", "partintro", "personblurb", "preface", "prefaceinfo", "procedure", "productionset", "qandadiv", "qandaset",
170 "refentryinfo", "reference", "referenceinfo", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
171 "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
172 "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "segmentedlist", "set", "setindex",
173 "setindexinfo", "setinfo", "sidebar", "sidebarinfo", "simplesect", "step", "table", "task", "taskprerequisites",
174 "taskrelated", "tasksummary", "tip", "toc", "variablelist", "warning", "refentry"
175 } };
176 static std::set<const char*, string_cmp> permitted;
177
178 if(permitted.empty())
179 permitted.insert(names.begin(), names.end());
180
181 return 0 != permitted.count(name);
182 }
183 //
184 // Determine whether this node can contain an indexterm or not:
185 //
can_contain_indexterm(const char * name)186 bool can_contain_indexterm(const char* name)
187 {
188 static const boost::array<const char*, 257> names =
189 { {
190 "abbrev", "accel", "ackno", "acronym", "action", "answer", "appendix", "appendixinfo", "application",
191 "article", "articleinfo", "artpagenums", "attribution", "authorinitials", "bibliocoverage", "bibliodiv",
192 "biblioentry", "bibliography", "bibliographyinfo", "biblioid", "bibliomisc", "bibliomixed", "bibliomset",
193 "bibliorelation", "biblioset", "bibliosource", "blockinfo", "blockquote", "bookinfo", "bridgehead", "callout",
194 "caution", "chapter", "chapterinfo", "citation", "citebiblioid", "citetitle", "city", "classname", "classsynopsisinfo",
195 "code", "collabname", "command", "computeroutput", "confdates", "confnum", "confsponsor", "conftitle", "constant",
196 "constraintdef", "contractnum", "contractsponsor", "contrib", "corpauthor", "corpcredit", "corpname", "country",
197 "database", "date", "dedication", "edition", "email", "emphasis", "entry", "envar", "errorcode", "errorname", "errortext",
198 "errortype", "example", "exceptionname", "fax", "figure", "filename", "firstname", "firstterm", "foreignphrase",
199 "formalpara", "funcparams", "funcsynopsisinfo", "function", "glossary", "glossaryinfo", "glossdef", "glossdiv",
200 "glossentry", "glosssee", "glossseealso", "glossterm", "guibutton", "guiicon", "guilabel", "guimenu", "guimenuitem",
201 "guisubmenu", "hardware", "highlights", "holder", "honorific", "important", "index", "indexinfo", "informalexample",
202 "informalfigure", "initializer", "interface", "interfacename", "invpartnumber", "isbn", "issn", "issuenum", "itemizedlist",
203 "itermset", "jobtitle", "keycap", "keycode", "keysym", "label", "legalnotice", "lineage", "lineannotation",
204 /*"link", */"listitem", "literal", "literallayout", "lotentry", "manvolnum", "markup", "medialabel", "member",
205 "methodname", "modespec", "modifier", "mousebutton", "msgaud", "msgexplan", "msglevel", "msgorig", "msgtext", "note",
206 "objectinfo", "olink", "option", "optional", "orderedlist", "orgdiv", "orgname", "otheraddr", "othername", "package",
207 "pagenums", "para", "parameter", "partinfo", "partintro", "phone", "phrase", "pob", "postcode", "preface", "prefaceinfo",
208 "procedure", "productname", "productnumber", "programlisting", "prompt", "property", "pubdate", "publishername",
209 "pubsnumber", "qandadiv", "qandaset", "question", "quote", "refentry", "refentryinfo", "refentrytitle", "referenceinfo",
210 "refmeta", "refmiscinfo", "refpurpose", "refsect1", "refsect1info", "refsect2", "refsect2info", "refsect3", "refsect3info",
211 "refsection", "refsectioninfo", "refsynopsisdiv", "refsynopsisdivinfo", "releaseinfo", "remark", "returnvalue",
212 "revdescription", "revnumber", "revremark", "screen", "screeninfo", "sect1", "sect1info", "sect2", "sect2info", "sect3",
213 "sect3info", "sect4", "sect4info", "sect5", "sect5info", "section", "sectioninfo", "seg", "segtitle", "seriesvolnums",
214 "setindex", "setindexinfo", "setinfo", "sgmltag", "shortaffil", "sidebar", "sidebarinfo", "simpara", "simplesect",
215 "state", "step", "street", "structfield", "structname", "subtitle", "surname", "symbol", "synopsis", "systemitem",
216 "table", "task", "taskprerequisites", "taskrelated", "tasksummary", "td", "term", "termdef", "th", "tip", /*"title",*/
217 "titleabbrev", "tocback", "tocentry", "tocfront", "token", "type", "ulink", "uri", "userinput", "variablelist",
218 "varname", "volumenum", "warning", "wordasword", "year"
219 } };
220 static std::set<const char*, string_cmp> permitted;
221
222 if(permitted.empty())
223 permitted.insert(names.begin(), names.end());
224
225 return 0 != permitted.count(name);
226 }
227 //
228 // Decide whether to flatten this node for searching purposes:
229 //
should_flatten_node(const char * name)230 bool should_flatten_node(const char* name)
231 {
232 //
233 // The list of nodes to flatten is basically the list of elements that
234 // can appear inside a <section> - see http://www.docbook.org/tdg/en/html/section.html.
235 // In other words basically anything at the level of a paragraph/table/listing etc.
236 //
237 static const boost::array<const char*, 57> names =
238 { {
239 "title", "subtitle", "titleabbrev",
240 "toc", "lot", "glossary", "bibliography",
241 /*"calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
242 "segmentedlist", "simplelist", "variablelist",*/ "caution", "important", "note",
243 "tip", "warning", "literallayout", "programlisting", "programlistingco",
244 "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
245 "classsynopsis", "fieldsynopsis", "constructorsynopsis",
246 "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
247 "address", "blockquote", "graphic", "graphicco", "mediaobject",
248 "mediaobjectco", "informalequation", "informalexample", "informalfigure",
249 "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
250 "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
251 "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
252 /*"biblioentry", "bibliomixed", "callout", "glossentry", "listitem", "seg", "seglistitem", "member",
253 "term", */
254 } };
255 static std::set<const char*, string_cmp> terminals;
256
257 if(terminals.empty())
258 terminals.insert(names.begin(), names.end());
259 return 0 != terminals.count(name);
260 }
unescape_xml(const std::string & s)261 std::string unescape_xml(const std::string& s)
262 {
263 boost::regex e("&(?:(quot)|(amp)|(apos)|(lt)|(gt));");
264 return regex_replace(s, e, "(?1\")(?2&)(?3\')(?4<)(?5>)", boost::regex_constants::format_all);
265 }
//
// Exception classes used to propagate processing instruction info.
//
// Thrown when a <?BoostAutoIndex IgnoreSection?> processing instruction
// is encountered:
//
struct ignore_section
{
};
// Thrown when a <?BoostAutoIndex IgnoreBlock?> processing instruction
// is encountered:
struct ignore_block
{
};
//
// Returns true if "name" is the name of one of the section-like
// elements (section, chapter etc) listed below.
//
bool is_section(const std::string& name)
{
   // The array size is deduced from the initialiser, so the table can't
   // end up padded with null pointers if an entry is added or removed:
   static const char* const data[] =
   {
      "dedication", "toc", "lot", "glossary", "bibliography", "preface", "chapter",
      "reference", "part", "article", "appendix", "index", "setindex", "colophon",
      "sect1", "refentry", "simplesect", "section", "partintro"
   };
   // Built once on first use, not on every call:
   static const std::set<std::string> names(data, data + sizeof(data) / sizeof(data[0]));
   return 0 != names.count(name);
}
//
// Returns true if "name" is the name of one of the block/paragraph
// level elements listed below.
//
bool is_block(const std::string& name)
{
   // The array size is deduced from the initialiser, so the table can't
   // end up padded with null pointers if an entry is added or removed:
   static const char* const data[] =
   {
      "calloutlist", "glosslist", "bibliolist", "itemizedlist", "orderedlist",
      "segmentedlist", "simplelist", "variablelist", "caution", "important", "note",
      "tip", "warning", "literallayout", "programlisting", "programlistingco",
      "screen", "screenco", "screenshot", "synopsis", "cmdsynopsis", "funcsynopsis",
      "classsynopsis", "fieldsynopsis", "constructorsynopsis",
      "destructorsynopsis", "methodsynopsis", "formalpara", "para", "simpara",
      "address", "blockquote", "graphic", "graphicco", "mediaobject",
      "mediaobjectco", "informalequation", "informalexample", "informalfigure",
      "informaltable", "equation", "example", "figure", "table", "msgset", "procedure",
      "sidebar", "qandaset", "task", "productionset", "constraintdef", "anchor",
      "bridgehead", "remark", "highlights", "abstract", "authorblurb", "epigraph"
   };
   // Built once on first use, not on every call:
   static const std::set<std::string> names(data, data + sizeof(data) / sizeof(data[0]));
   return 0 != names.count(name);
}
311 //
312 // Helper proc to recurse through children:
313 //
314 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen);
recurse_through_children(boost::tiny_xml::element_ptr node,node_id * id,title_info * pt,bool seen)315 bool recurse_through_children(boost::tiny_xml::element_ptr node, node_id* id, title_info* pt, bool seen)
316 {
317 try
318 {
319 for(boost::tiny_xml::element_list::const_iterator i = node->elements.begin();
320 i != node->elements.end(); ++i)
321 {
322 process_node(*i, id, pt, seen);
323 }
324 }
325 catch(const ignore_section&)
326 {
327 if(is_section(node->name))
328 return false;
329 else
330 throw;
331 }
332 catch(const ignore_block&)
333 {
334 if(is_block(node->name) || is_section(node->name))
335 return false;
336 else
337 throw;
338 }
339 return true;
340 }
341 //
342 // This does most of the work: process the node pointed to, and any children
343 // that it may have:
344 //
process_node(boost::tiny_xml::element_ptr node,node_id * prev,title_info * pt,bool seen=false)345 void process_node(boost::tiny_xml::element_ptr node, node_id* prev, title_info* pt, bool seen = false)
346 {
347 //
348 // Store the current ID and title as nested scoped objects:
349 //
350 node_id id = { 0, prev };
351 if(can_contain_title(node->name.c_str()))
352 {
353 // Only set the ID to link to if the block can contain a title, see
354 // can_contain_title above for rationale.
355 id.id = find_attr(node, "id");
356 }
357 title_info title = { "", pt};
358 bool flatten = should_flatten_node(node->name.c_str());
359
360 if(node->name.size() && node->name[0] == '?')
361 {
362 if(node->name == "?BoostAutoIndex")
363 {
364 if(node->content == "IgnoreSection")
365 {
366 throw ignore_section();
367 }
368 else if(node->content == "IgnoreBlock")
369 {
370 throw ignore_block();
371 }
372 }
373 return; // Ignore processing instructions
374 }
375 else if((node->name == "title") && (id.prev->id))
376 {
377 //
378 // This actually sets the title of the enclosing scope,
379 // not this tag itself:
380 //
381 title.prev->title = get_consolidated_content(node);
382 if(verbose)
383 std::cout << "Indexing section: " << title.prev->title << std::endl;
384 }
385 else if((node->name == "refentrytitle") && (id.prev->prev->id))
386 {
387 //
388 // This actually sets the title of the enclosing refentry scope,
389 // not this tag itself:
390 //
391 title.prev->prev->title = get_consolidated_content(node);
392 if(verbose)
393 std::cout << "Indexing refentry: " << title.prev->prev->title << std::endl;
394 }
395 if(node->name == "anchor")
396 {
397 if(node->parent.lock()->name == "title")
398 {
399 // We have a title with a nested anchor ID, change the ID of our parents parent to match:
400 id.prev->prev->id = id.id;
401 }
402 }
403 else if(node->name == "index")
404 {
405 // Keep track of all the indexes we see:
406 indexes.push_back(node);
407 if(node->parent.lock()->name == "para")
408 node->parent.lock()->name = "";
409 }
410 else if(node->name == "primary")
411 {
412 last_primary = get_consolidated_content(node);
413 }
414 else if(node->name == "secondary")
415 {
416 last_secondary = get_consolidated_content(node);
417 }
418 else if(node->name == "tertiary")
419 {
420 last_tertiary = get_consolidated_content(node);
421 }
422 else if((node->name == "see") && internal_indexes)
423 {
424 std::cerr << "WARNING: <see> in XML source will be ignored for the index generation" << std::endl;
425 }
426 else if((node->name == "seealso") && internal_indexes)
427 {
428 std::cerr << "WARNING: <seealso> in XML source will be ignored for the index generation" << std::endl;
429 }
430
431 std::string flattenned_text;
432 const std::string* ptext;
433 if(flatten)
434 {
435 flattenned_text = unescape_xml(get_consolidated_content(node));
436 ptext = &flattenned_text;
437 //
438 // Recurse through children here if we're going to flatten the text, that way we see any processing instructions first:
439 //
440 if(!recurse_through_children(node, &id, &title, flatten || seen))
441 return;
442 }
443 else
444 {
445 ptext = &(node->content);
446 }
447
448 //
449 // Search content for items: we only search if the content is not empty,
450 // and the content is not whitespace alone, and we haven't already searched this
451 // text in one of our parent nodes that got flattened.
452 //
453 static const boost::regex space_re("[[:space:]]+");
454 if(!seen && ptext->size() && !regex_match(*ptext, space_re))
455 {
456 // Save block ID and title in case we find some hits:
457 const std::string* pid = get_current_block_id(&id);
458 const std::string& rtitle = get_current_block_title(&title);
459 const std::string simple_title = rewrite_title(rtitle, *pid);
460 // Scan for each index term:
461 for(std::set<index_info>::const_iterator i = index_terms.begin();
462 i != index_terms.end(); ++i)
463 {
464 if(regex_search(*ptext, i->search_text))
465 {
466 //
467 // We need to check to see if this term has already been indexed
468 // in this zone, in order to prevent duplicate entries, also check
469 // that any constrait placed on the term's ID is satisfied:
470 //
471 std::pair<std::string, std::string> item_index(*pid, i->term);
472 if(((no_duplicates == false) || (0 == found_terms.count(item_index)))
473 && (i->search_id.empty() || regex_match(*pid, i->search_id)))
474 {
475 // We have something to index!
476 found_terms.insert(item_index);
477
478 if(!debug.empty() && (regex_match(i->term, debug) || regex_match(rtitle, debug) || regex_match(simple_title, debug)))
479 {
480 std::cout << "Debug term found, in block with ID: " << *pid << std::endl;
481 std::cout << "Current section title is: " << rtitle << std::endl;
482 std::cout << "The main index entry will be : " << simple_title << std::endl;
483 std::cout << "The indexed term is: " << i->term << std::endl;
484 std::cout << "The search regex is: " << i->search_text << std::endl;
485 std::cout << "The section constraint is: " << i->search_id << std::endl;
486 std::cout << "The index type for this entry is: " << i->category << std::endl;
487 }
488
489 if(use_section_names && (simple_title != i->term))
490 {
491 //
492 // First off insert index entry with primary term
493 // consisting of the section title, and secondary term the
494 // actual index term, this gets skipped if the title and index
495 // term are the same:
496 //
497 if(internal_indexes == false)
498 {
499 // Insert an <indexterm> into the XML:
500 boost::tiny_xml::element_ptr p(new boost::tiny_xml::element());
501 p->name = "indexterm";
502 boost::tiny_xml::element_ptr prim(new boost::tiny_xml::element());
503 prim->name = "primary";
504 prim->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
505 prim->elements.front()->content = simple_title;
506 p->elements.push_front(prim);
507
508 boost::tiny_xml::element_ptr sec(new boost::tiny_xml::element());
509 sec->name = "secondary";
510 sec->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
511 sec->elements.front()->content = i->term;
512 p->elements.push_back(sec);
513 try{
514 // Insert the Indexterm:
515 boost::tiny_xml::element_ptr parent(node->parent);
516 while(!can_contain_indexterm(parent->name.c_str()))
517 parent = parent->parent.lock();
518 parent->elements.push_front(p);
519 }
520 catch(const std::exception&)
521 {
522 std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
523 }
524 }
525 // Track the entry in our internal index:
526 index_entry_ptr item1(new index_entry(simple_title));
527 index_entry_ptr item2(new index_entry(i->term, *pid));
528 index_entry_set::iterator pos = index_entries.insert(item1).first;
529 (**pos).sub_keys.insert(item2);
530 }
531 //
532 // Now insert another index entry with the index term
533 // as the primary key, and the section title as the
534 // secondary key, this one gets assigned to the
535 // appropriate index category if there is one:
536 //
537 bool preferred_term = false;
538 if(internal_indexes == false)
539 {
540 // Insert <indexterm> into the XML:
541 boost::tiny_xml::element_ptr p2(new boost::tiny_xml::element());
542 p2->name = "indexterm";
543 if(i->category.size())
544 {
545 p2->attributes.push_back(boost::tiny_xml::attribute("type", i->category));
546 }
547 boost::tiny_xml::element_ptr prim2(new boost::tiny_xml::element());
548 prim2->name = "primary";
549 prim2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
550 prim2->elements.front()->content = i->term;
551 p2->elements.push_front(prim2);
552
553 boost::tiny_xml::element_ptr sec2(new boost::tiny_xml::element());
554 sec2->name = "secondary";
555 sec2->elements.push_front(boost::tiny_xml::element_ptr(new boost::tiny_xml::element()));
556 sec2->elements.front()->content = rtitle;
557 p2->elements.push_back(sec2);
558 try{
559 // Insert the Indexterm:
560 boost::tiny_xml::element_ptr parent(node->parent);
561 while(!can_contain_indexterm(parent->name.c_str()))
562 {
563 // If the search text was found in a title then make it a preferred term:
564 if(parent->name == "title")
565 preferred_term = true;
566 parent = parent->parent.lock();
567 }
568 if(preferred_term)
569 {
570 boost::tiny_xml::attribute a("significance", "preferred");
571 p2->attributes.push_back(a);
572 }
573 parent->elements.push_front(p2);
574 }
575 catch(const std::exception&)
576 {
577 std::cerr << "Unable to find location to insert <indexterm>" << std::endl;
578 }
579 }
580
581 // Track the entry in our internal index:
582 try{
583 // figure out if it's preferred or not:
584 boost::tiny_xml::element_ptr parent(node->parent);
585 while(!can_contain_indexterm(parent->name.c_str()))
586 {
587 // If the search text was found in a title then make it a preferred term:
588 if(parent->name == "title")
589 {
590 preferred_term = true;
591 }
592 parent = parent->parent.lock();
593 if(!parent)
594 break;
595 }
596 }
597 catch(const std::exception&){}
598
599 index_entry_ptr item3(new index_entry(i->term));
600 if(i->category.size())
601 item3->category = i->category;
602 index_entry_ptr item4(new index_entry(rtitle, *pid));
603 item4->preferred = preferred_term;
604 index_entry_set::iterator pos = index_entries.insert(item3).first;
605 (**pos).sub_keys.insert(item4);
606 }
607 }
608 }
609 }
610 //
611 // Recurse through children, if not done already:
612 //
613 if(!flatten)
614 recurse_through_children(node, &id, &title, flatten || seen);
615 //
616 // Process manual index entries last of all:
617 //
618 if(node->name == "indexterm")
619 {
620 // Track the entry in our internal index:
621 const std::string* pid = get_current_block_id(&id);
622 const std::string* attr = find_attr(node, "type");
623 const std::string& rtitle = get_current_block_title(&title);
624 const std::string simple_title = rewrite_title(rtitle, *pid);
625 index_entry_ptr item1(new index_entry(last_primary, "", attr ? *attr : ""));
626 index_entry_set* parent = &((*index_entries.insert(item1).first)->sub_keys);
627
628 if(last_secondary.size())
629 {
630 item1.reset(new index_entry(last_secondary, "", attr ? *attr : ""));
631 parent = &((*parent->insert(item1).first)->sub_keys);
632 }
633 if(last_tertiary.size())
634 {
635 item1.reset(new index_entry(last_tertiary, "", attr ? *attr : ""));
636 parent = &((*parent->insert(item1).first)->sub_keys);
637 }
638 item1.reset(new index_entry(simple_title, *pid, attr ? *attr : ""));
639 parent->insert(item1);
640
641 last_primary = "";
642 last_secondary = "";
643 last_tertiary = "";
644 }
645 }
646
process_nodes(boost::tiny_xml::element_ptr node)647 void process_nodes(boost::tiny_xml::element_ptr node)
648 {
649 node_id id = { 0, };
650 title_info t = { "", 0 };
651 process_node(node, &id, &t);
652 }
653
main(int argc,char * argv[])654 int main(int argc, char* argv[])
655 {
656 try{
657
658 namespace po = boost::program_options;
659 po::options_description desc("AutoIndex Allowed Options");
660 desc.add_options()
661 ("help", "Print help message")
662 ("in", po::value<std::string>(), "Set the input XML file.")
663 ("out", po::value<std::string>(), "Set output input XML file.")
664 ("scan", po::value<std::string>(), "Scan the specified file for terms to try and index.")
665 ("script", po::value<std::string>(), "Specifies the script file to use.")
666 ("no-duplicates", "Prevents duplicate index entries within the same section.")
667 ("no-section-names", "Suppresses use of section names as index entries.")
668 ("internal-index", "Causes AutoIndex to generate the index itself, rather than relying on the XSL stylesheets.")
669 ("verbose", "Turns on verbose mode.")
670 ("prefix", po::value<std::string>(), "Sets the prefix to be prepended to all file names and paths in the script file.")
671 ("index-type", po::value<std::string>(), "Sets the XML container type to use the index.")
672 ;
673
674 po::variables_map vm;
675 po::store(po::parse_command_line(argc, argv, desc), vm);
676 po::notify(vm);
677
678 //
679 // Process arguments:
680 //
681 if(vm.count("help"))
682 {
683 std::cout << desc;
684 return 0;
685 }
686 if(vm.count("in"))
687 {
688 infile = vm["in"].as<std::string>();
689 }
690 else
691 {
692 std::cerr << "No input XML file specified" << std::endl;
693 return 1;
694 }
695 if(vm.count("out"))
696 {
697 outfile = vm["out"].as<std::string>();
698 }
699 else
700 {
701 std::cerr << "No output XML file specified" << std::endl;
702 return 1;
703 }
704 if(vm.count("verbose"))
705 {
706 verbose = true;
707 }
708 if(vm.count("prefix"))
709 {
710 prefix = vm["prefix"].as<std::string>();
711 }
712 if(vm.count("scan"))
713 {
714 std::string f = vm["scan"].as<std::string>();
715 if(!exists(boost::filesystem::path(f)))
716 throw std::runtime_error("Error the file requested for scanning does not exist: " + f);
717 scan_file(f);
718 }
719 if(vm.count("script"))
720 {
721 process_script(vm["script"].as<std::string>());
722 }
723 if(vm.count("no-duplicates"))
724 {
725 no_duplicates = true;
726 }
727 if(vm.count("no-section-names"))
728 {
729 use_section_names = false;
730 }
731 if(vm.count("internal-index"))
732 {
733 internal_indexes = true;
734 }
735 if(vm.count("index-type"))
736 {
737 internal_index_type = vm["index-type"].as<std::string>();
738 }
739
740 std::ifstream is(infile.c_str());
741 if((0 == is.peek()) || !is.good())
742 {
743 std::cerr << "Unable to open XML data file " << argv[1] << std::endl;
744 return 1;
745 }
746 //
747 // We need to skip any leading <? and <! elements:
748 //
749 std::string header = get_header(is);
750 boost::tiny_xml::element_ptr xml = boost::tiny_xml::parse(is, "");
751 is.close();
752
753 std::cout << "Indexing " << index_terms.size() << " terms..." << std::endl;
754
755 process_nodes(xml);
756
757 if(internal_indexes)
758 generate_indexes();
759
760 std::ofstream os(outfile.c_str());
761 os << header << std::endl;
762 boost::tiny_xml::write(*xml, os);
763 std::cout << index_entries.size() << " Index entries were created." << std::endl;
764
765 }
766 catch(boost::exception& e)
767 {
768 std::cerr << diagnostic_information(e);
769 return 1;
770 }
771 catch(const std::exception& e)
772 {
773 std::cerr << e.what() << std::endl;
774 return 1;
775 }
776 catch(const std::string& s)
777 {
778 std::cerr << s << std::endl;
779 return 1;
780 }
781
782 return 0;
783 }
784