/*=============================================================================
    Copyright (c) 2005 2006 Joel de Guzman
    http://spirit.sourceforge.net/

    Use, modification and distribution is subject to the Boost Software
    License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#include "post_process.hpp"
// NOTE(review): the header names below were reconstructed from usage in this
// file (std::isspace, std::set, std::stack, boost::bind, Spirit Classic,
// Phoenix v1) -- confirm against the canonical file.
#include <cctype>
#include <set>
#include <stack>
#include <boost/bind.hpp>
#include <boost/spirit/include/classic_core.hpp>
#include <boost/spirit/include/phoenix1_operators.hpp>
#include <boost/spirit/include/phoenix1_primitives.hpp>

namespace quickbook
{
    namespace cl = boost::spirit::classic;
    namespace ph = phoenix;
    typedef std::string::const_iterator iter_type;

    // Appends characters to an output string while tracking the current
    // column, so that lines can be broken near `linewidth` and block tags
    // can be aligned at `current_indent`.
    //
    // `out` and `current_indent` are references owned by the caller
    // (tidy_compiler); `current_indent` is changed from outside as block
    // tags are opened and closed.
    struct pretty_printer
    {
        pretty_printer(
            std::string& out_, int& current_indent_, int linewidth_)
            : prev(0)
            , out(out_)
            , current_indent(current_indent_)
            , column(0)
            , in_string(false)
            , linewidth(linewidth_)
        {
        }

        // Append `current_indent` spaces and set `column` to match.
        void indent()
        {
            BOOST_ASSERT(current_indent >= 0); // this should not happen!
            for (int i = 0; i < current_indent; ++i)
                out += ' ';
            column = current_indent;
        }

        // Remove trailing ' ' characters from `out`. If `out` contains
        // nothing but spaces, find_last_not_of() yields npos and npos + 1
        // wraps to 0, so the whole string is erased. `column` is not
        // adjusted here.
        void trim_spaces()
        {
            out.erase(out.find_last_not_of(' ') + 1); // trim trailing spaces
        }

        // End the current line (without trailing spaces) and start a new
        // one at the current indent.
        void break_line()
        {
            trim_spaces();
            out += '\n';
            indent();
        }

        // True if everything written past the indent position on the
        // current line is spaces. Only meaningful when
        // column >= current_indent (see align_indent()).
        bool line_is_empty() const
        {
            for (iter_type i = out.end() - (column - current_indent);
                 i != out.end(); ++i) {
                if (*i != ' ') return false;
            }
            return true;
        }

        void align_indent()
        {
            // make sure we are at the proper indent position
            if (column != current_indent) {
                if (column > current_indent) {
                    if (line_is_empty()) {
                        // trim just enough trailing spaces down to
                        // current_indent position
                        out.erase(
                            out.end() - (column - current_indent),
                            out.end());
                        column = current_indent;
                    }
                    else {
                        // nope, line is not empty. do a hard CR
                        break_line();
                    }
                }
                else {
                    // will this happen? (i.e. column <= current_indent)
                    while (column != current_indent) {
                        out += ' ';
                        ++column;
                    }
                }
            }
        }

        void print(char ch)
        {
            // Print a char. Attempt to break the line if we are exceeding
            // the target linewidth. The linewidth is not an absolute limit.
            // There are many cases where a line will exceed the linewidth
            // and there is no way to properly break the line. Preformatted
            // code that exceeds the linewidth are examples. We cannot break
            // preformatted code. We shall not attempt to be very strict with
            // line breaking. What's more important is to have a reproducible
            // output (i.e. processing two logically equivalent xml files
            // results in two lexically equivalent xml files). *** pretty
            // formatting is a secondary goal ***

            // Strings will occur only in tag attributes. Normal content
            // will have &quot; instead. We shall deal only with tag
            // attributes here.
            if (ch == '"') in_string = !in_string; // don't break strings!

            if (!in_string && std::isspace(static_cast<unsigned char>(ch))) {
                // we can break spaces if they are not inside strings
                // (a run of whitespace is collapsed: only the first
                // whitespace character after a non-space produces output)
                if (!std::isspace(static_cast<unsigned char>(prev))) {
                    if (column >= linewidth) {
                        break_line();
                        if (column == 0 && ch == ' ') {
                            ++column;
                            out += ' ';
                        }
                    }
                    else {
                        ++column;
                        out += ' ';
                    }
                }
            }
            else {
                // we can break tag boundaries and stuff after
                // delimiters if they are not inside strings
                // and *only-if* the preceding char is a space
                if (!in_string && column >= linewidth &&
                    (ch == '<' &&
                     std::isspace(static_cast<unsigned char>(prev))))
                    break_line();
                out += ch;
                ++column;
            }

            prev = ch;
        }

        // Print every character of [f, l) through print(char).
        void print(iter_type f, iter_type l)
        {
            for (iter_type i = f; i != l; ++i)
                print(*i);
        }

        // Print a complete tag. Flow tags are printed verbatim (including
        // any trailing whitespace captured with them); other tags have
        // their trailing whitespace removed first.
        void print_tag(iter_type f, iter_type l, bool is_flow_tag)
        {
            if (is_flow_tag) {
                print(f, l);
            }
            else {
                // This is not a flow tag, so, we're going to do a
                // carriage return anyway. Let us remove extra right
                // spaces.
                std::string str(f, l);
                BOOST_ASSERT(f != l); // this should not happen
                iter_type i = str.end();
                while (i != str.begin() &&
                       std::isspace(static_cast<unsigned char>(*(i - 1))))
                    --i;
                print(str.begin(), i);
            }
        }

        char prev;           // last character passed to print(char)
        std::string& out;    // destination buffer (not owned)
        int& current_indent; // indent, in spaces, for block tags (not owned)
        int column;          // characters on the current line, as tracked
                             // by indent()/print()/align_indent()
        bool in_string;      // toggled on every '"' seen by print(char)
        int linewidth;       // soft limit at which lines are broken

      private:
        // Not assignable (declared, never defined): the struct holds
        // reference members.
        pretty_printer& operator=(pretty_printer const&);
    };

    // Tags treated as block (non-flow) tags when processing HTML.
    char const* html_block_tags_[] = {
        "div",   "p",     "blockquote", "address",  "h1",   "h2",  "h3",
        "h4",    "h5",    "h6",         "ul",       "ol",   "li",  "dl",
        "dt",    "dd",    "table",      "tr",       "th",   "td",  "tbody",
        "thead", "form",  "fieldset",   "hr",       "noscript",
        "html",  "body"};

    // Tags treated as block (non-flow) tags when not processing HTML.
    char const* block_tags_[] = {
        "author",       "blockquote",    "bridgehead",   "callout",
        "calloutlist",  "caution",       "copyright",    "entry",
        "important",    "informaltable", "itemizedlist", "legalnotice",
        "listitem",     "note",          "orderedlist",  "para",
        "row",          "section",       "simpara",      "table",
        "tbody",        "textobject",    "tgroup",       "thead",
        "tip",          "variablelist",  "varlistentry", "warning",
        "xml",          "xi:include"};

    // Document element names; each one, plus its "<name>info" and
    // "<name>purpose" variants, is also registered as a block tag in the
    // non-HTML case (see tidy_compiler's constructor).
    char const* doc_types_[] = {"book",      "article",  "library",
                                "chapter",   "part",     "appendix",
                                "preface",   "qandadiv", "qandaset",
                                "reference", "set"};

    // Mutable state shared by the grammar's semantic actions: the output
    // buffer, the indent level, the printer, the stack of open tags and
    // the set of tag names that are laid out as blocks.
    struct tidy_compiler
    {
        tidy_compiler(std::string& out_, int linewidth_, bool is_html)
            : out(out_)
            , current_indent(0)
            , printer(out_, current_indent, linewidth_)
        {
            if (is_html) {
                static std::size_t const n_block_tags =
                    sizeof(html_block_tags_) / sizeof(char const*);
                for (std::size_t i = 0; i != n_block_tags; ++i) {
                    block_tags.insert(html_block_tags_[i]);
                }
            }
            else {
                static std::size_t const n_block_tags =
                    sizeof(block_tags_) / sizeof(char const*);
                for (std::size_t i = 0; i != n_block_tags; ++i) {
                    block_tags.insert(block_tags_[i]);
                }

                static std::size_t const n_doc_types =
                    sizeof(doc_types_) / sizeof(char const*);
                for (std::size_t i = 0; i != n_doc_types; ++i) {
                    block_tags.insert(doc_types_[i]);
                    block_tags.insert(doc_types_[i] + std::string("info"));
                    block_tags.insert(
                        doc_types_[i] + std::string("purpose"));
                }
            }
        }

        // A flow tag is any tag that is not registered in `block_tags`.
        bool is_flow_tag(std::string const& tag)
        {
            return block_tags.find(tag) == block_tags.end();
        }

        std::set<std::string> block_tags;
        std::stack<std::string> tags; // names of currently open start tags
        std::string& out;
        int current_indent; // must be declared before `printer`, which
                            // binds a reference to it
        pretty_printer printer;
        std::string current_tag; // name captured by the most recent `tag`
                                 // rule match

      private:
        // Not assignable (declared, never defined).
        tidy_compiler& operator=(tidy_compiler const&);
    };

    // Spirit Classic grammar that splits the input into markup pieces and
    // forwards each piece to a do_* action, which writes the tidied form
    // into `state`.
    struct tidy_grammar : cl::grammar<tidy_grammar>
    {
        tidy_grammar(tidy_compiler& state_, int indent_, bool is_html_)
            : state(state_), indent(indent_), is_html(is_html_)
        {
        }

        template <typename Scanner> struct definition
        {
            definition(tidy_grammar const& self)
            {
                // NOTE(review): the markup literals in this grammar
                // ("</pre>", "<programlisting>", "<?", "<!--", "</", and
                // the quickbook-escape markers) were restored from a
                // tag-stripped copy of this file -- confirm against the
                // canonical source.

                // clang-format off

                tag = (cl::lexeme_d[+(cl::alnum_p | '_' | ':')])
                    [boost::bind(&tidy_grammar::do_tag, &self, _1, _2)];

                code
                    =   cl::eps_p(ph::var(self.is_html))
                    >>  "<"
                    >>  cl::lexeme_d[cl::str_p("pre")]
                    >>  *(cl::anychar_p - '>')
                    >>  ">"
                    >>  *(cl::anychar_p - "</pre>")
                    >>  "</pre"
                    >>  cl::lexeme_d[">" >> *cl::space_p]
                    |   cl::eps_p(!ph::var(self.is_html))
                    >>  "<programlisting>"
                    >>  *(cl::anychar_p - "</programlisting>")
                    >>  "</programlisting"
                    >>  cl::lexeme_d[">" >> *cl::space_p]
                    ;

                // What's the business of cl::lexeme_d['>' >> *cl::space_p]; ?
                // It is there to preserve the space after the tag that is
                // otherwise consumed by the cl::space_p skipper.

                escape =
                    cl::str_p("<!--quickbook-escape-prefix-->") >>
                    (*(cl::anychar_p - cl::str_p("<!--quickbook-escape-postfix-->")))
                    [
                        boost::bind(&tidy_grammar::do_escape, &self, _1, _2)
                    ]
                    >>  cl::lexeme_d
                        [
                            cl::str_p("<!--quickbook-escape-postfix-->") >>
                            (*cl::space_p)
                            [
                                boost::bind(&tidy_grammar::do_escape_post, &self, _1, _2)
                            ]
                        ]
                    ;

                start_tag = '<' >> tag >> *(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p];
                start_end_tag =
                        '<' >> tag >> *(cl::anychar_p - ("/>" | cl::ch_p('>'))) >> cl::lexeme_d["/>" >> *cl::space_p]
                    |   "<?" >> tag >> *(cl::anychar_p - '?') >> cl::lexeme_d["?>" >> *cl::space_p]
                    |   "<!--" >> *(cl::anychar_p - "-->") >> cl::lexeme_d["-->" >> *cl::space_p]
                    |   "<!" >> tag >> *(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p]
                    ;
                content = cl::lexeme_d[ +(cl::anychar_p - '<') ];
                end_tag = "</" >> +(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p];

                markup
                    =   escape
                    |   code            [boost::bind(&tidy_grammar::do_code, &self, _1, _2)]
                    |   start_end_tag   [boost::bind(&tidy_grammar::do_start_end_tag, &self, _1, _2)]
                    |   start_tag       [boost::bind(&tidy_grammar::do_start_tag, &self, _1, _2)]
                    |   end_tag         [boost::bind(&tidy_grammar::do_end_tag, &self, _1, _2)]
                    |   content         [boost::bind(&tidy_grammar::do_content, &self, _1, _2)]
                    ;

                tidy = *markup;

                // clang-format on
            }

            cl::rule<Scanner> const& start() { return tidy; }

            cl::rule<Scanner> tidy, tag, start_tag, start_end_tag, content,
                end_tag, markup, code, escape;
        };

        // Copy the whitespace that follows an escape block to the output
        // unchanged. Writes to `state.out` directly, bypassing the
        // printer's column tracking.
        void do_escape_post(iter_type f, iter_type l) const
        {
            for (iter_type i = f; i != l; ++i)
                state.out += *i;
        }

        // Copy the body of an escape block to the output verbatim, with
        // leading and trailing whitespace removed. Writes to `state.out`
        // directly, bypassing the printer's column tracking.
        void do_escape(iter_type f, iter_type l) const
        {
            // std::isspace requires a value representable as unsigned
            // char; a plain (possibly negative) char is undefined.
            while (f != l && std::isspace(static_cast<unsigned char>(*f))) {
                ++f;
            }
            while (f != l &&
                   std::isspace(static_cast<unsigned char>(*(l - 1)))) {
                --l;
            }
            for (iter_type i = f; i != l; ++i) {
                state.out += *i;
            }
        }

        // Emit a preformatted code block on its own lines, unindented and
        // unwrapped, normalising line endings to '\n'.
        void do_code(iter_type f, iter_type l) const
        {
            state.printer.trim_spaces();
            // Start the block on a fresh line. The buffer may be empty
            // here (code block first in the input, or trim_spaces()
            // removed everything), in which case no newline is needed and
            // indexing size() - 1 would be out of range.
            if (!state.out.empty() &&
                state.out[state.out.size() - 1] != '\n')
                state.out += '\n';

            // trim trailing space from after closing tag
            while (f != l &&
                   std::isspace(static_cast<unsigned char>(*(l - 1)))) {
                --l;
            }

            // print the string taking care of line
            // ending CR/LF platform issues
            for (iter_type i = f; i != l;) {
                if (*i == '\n') {
                    state.printer.trim_spaces();
                    state.out += '\n';
                    ++i;
                    if (i != l && *i == '\r') {
                        ++i;
                    }
                }
                else if (*i == '\r') {
                    state.printer.trim_spaces();
                    state.out += '\n';
                    ++i;
                    if (i != l && *i == '\n') {
                        ++i;
                    }
                }
                else {
                    state.out += *i;
                    ++i;
                }
            }
            state.out += '\n';
            // indent() also re-synchronises the printer's column after the
            // direct writes above.
            state.printer.indent();
        }

        // Remember the tag name just matched by the `tag` rule.
        void do_tag(iter_type f, iter_type l) const
        {
            state.current_tag = std::string(f, l);
        }

        // Self-contained markup matched by `start_end_tag`: block tags
        // get a line of their own, flow tags are printed inline.
        void do_start_end_tag(iter_type f, iter_type l) const
        {
            bool is_flow_tag = state.is_flow_tag(state.current_tag);
            if (!is_flow_tag) state.printer.align_indent();
            state.printer.print_tag(f, l, is_flow_tag);
            if (!is_flow_tag) state.printer.break_line();
        }

        // Opening tag: push its name; for block tags, print it on its own
        // line and increase the indent for what follows.
        void do_start_tag(iter_type f, iter_type l) const
        {
            state.tags.push(state.current_tag);
            bool is_flow_tag = state.is_flow_tag(state.current_tag);
            if (!is_flow_tag) state.printer.align_indent();
            state.printer.print_tag(f, l, is_flow_tag);
            if (!is_flow_tag) {
                // the indent must change before break_line() so that the
                // new line starts at the nested level
                state.current_indent += indent;
                state.printer.break_line();
            }
        }

        // Character data between tags.
        void do_content(iter_type f, iter_type l) const
        {
            state.printer.print(f, l);
        }

        // Closing tag: whether it is laid out as a block is decided by
        // the name on top of the open-tag stack, not by the text of the
        // end tag. Throws post_process_failure if no tag is open.
        void do_end_tag(iter_type f, iter_type l) const
        {
            if (state.tags.empty())
                throw quickbook::post_process_failure("Mismatched tags.");

            bool is_flow_tag = state.is_flow_tag(state.tags.top());
            if (!is_flow_tag) {
                state.current_indent -= indent;
                state.printer.align_indent();
            }
            state.printer.print_tag(f, l, is_flow_tag);
            if (!is_flow_tag) state.printer.break_line();
            state.tags.pop();
        }

        tidy_compiler& state; // shared output state (not owned)
        int indent;           // spaces added per nested block tag
        bool is_html;         // selects the <pre> vs <programlisting>
                              // alternative of the `code` rule

      private:
        // Not assignable (declared, never defined).
        tidy_grammar& operator=(tidy_grammar const&);
    };

    // Re-indent and re-wrap the markup in `in` and return the result.
    //
    // indent:    spaces per nesting level of block tags; -1 selects 2.
    // linewidth: soft line-width target; -1 selects 80.
    // is_html:   use the HTML block-tag table and treat <pre> as the code
    //            element; otherwise use the non-HTML table and
    //            <programlisting>.
    //
    // Throws post_process_failure if the grammar does not consume the
    // whole input, or (from do_end_tag) if an end tag is seen while no
    // tag is open.
    std::string post_process(
        std::string const& in, int indent, int linewidth, bool is_html)
    {
        if (indent == -1) indent = 2;        // set default to 2
        if (linewidth == -1) linewidth = 80; // set default to 80

        std::string tidy;
        tidy_compiler state(tidy, linewidth, is_html);
        tidy_grammar g(state, indent, is_html);
        cl::parse_info<iter_type> r =
            parse(in.begin(), in.end(), g, cl::space_p);
        if (r.full) {
            return tidy;
        }
        else {
            throw quickbook::post_process_failure("Post Processing Failed.");
        }
    }
}