1 // link_check implementation -----------------------------------------------// 2 3 // Copyright Beman Dawes 2002. 4 // 5 // Distributed under the Boost Software License, Version 1.0. 6 // (See accompanying file LICENSE_1_0.txt or copy at 7 // http://www.boost.org/LICENSE_1_0.txt) 8 9 #include "link_check.hpp" 10 #include "boost/regex.hpp" 11 #include "boost/filesystem/operations.hpp" 12 #include <boost/algorithm/string/case_conv.hpp> 13 #include <cstdlib> 14 #include <set> 15 16 // #include <iostream> 17 18 namespace fs = boost::filesystem; 19 20 namespace 21 { 22 boost::regex html_bookmark_regex( 23 "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3" 24 "|<!--.*?-->", 25 boost::regbase::normal | boost::regbase::icase); 26 boost::regex html_url_regex( 27 "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC 28 "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2" 29 "|<!--.*?-->", 30 boost::regbase::normal | boost::regbase::icase); 31 boost::regex css_url_regex( 32 "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)" 33 "|/\\*.*?\\*/", 34 boost::regbase::normal | boost::regbase::icase); 35 36 // Regular expression for parsing URLS from: 37 // http://tools.ietf.org/html/rfc3986#appendix-B 38 boost::regex url_decompose_regex( 39 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$", 40 boost::regbase::normal); 41 42 typedef std::set<std::string> bookmark_set; 43 bookmark_set bookmarks; 44 bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive 45 46 // Decode html escapsed ampersands, returns an empty string if there's an error. decode_ampersands(std::string const & url_path)47 std::string decode_ampersands(std::string const& url_path) { 48 std::string::size_type pos = 0, next; 49 std::string result; 50 result.reserve(url_path.length()); 51 52 while((next = url_path.find('&', pos)) != std::string::npos) { 53 result.append(url_path, pos, next - pos); 54 pos = next; 55 if(url_path.substr(pos, 5) == "&") { 56 result += '&'; pos += 5; 57 } 58 else { 59 result += '&'; pos += 1; 60 } 61 break; 62 } 63 64 result.append(url_path, pos, url_path.length()); 65 66 return result; 67 } 68 69 // Decode percent encoded characters, returns an empty string if there's an error. decode_percents(std::string const & url_path)70 std::string decode_percents(std::string const& url_path) { 71 std::string::size_type pos = 0, next; 72 std::string result; 73 result.reserve(url_path.length()); 74 75 while((next = url_path.find('%', pos)) != std::string::npos) { 76 result.append(url_path, pos, next - pos); 77 pos = next; 78 switch(url_path[pos]) { 79 case '%': { 80 if(url_path.length() - next < 3) return ""; 81 char hex[3] = { url_path[next + 1], url_path[next + 2], '\0' }; 82 char* end_ptr; 83 result += (char) std::strtol(hex, &end_ptr, 16); 84 if(*end_ptr) return ""; 85 pos = next + 3; 86 break; 87 } 88 } 89 } 90 91 result.append(url_path, pos, url_path.length()); 92 93 return result; 94 } 95 is_css(const path & p)96 bool is_css(const path & p) { 97 return p.extension() == ".css"; 98 } 99 100 } // unnamed namespace 101 102 namespace boost 103 { 104 namespace inspect 105 { 106 107 // link_check constructor --------------------------------------------------// 108 link_check()109 link_check::link_check() 110 : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0), 111 m_bookmark_errors(0), m_duplicate_bookmark_errors(0) 112 { 113 // HTML signatures are already registered by the base class, 114 // 'hypertext_inspector' 115 register_signature(".css"); 116 } 117 118 // inspect (all) -----------------------------------------------------------// 119 inspect(const string &,const path & full_path)120 void link_check::inspect( 121 const string & /*library_name*/, 122 const path & full_path ) 123 { 124 // keep track of paths already encountered to reduce disk activity 125 if ( !fs::is_directory( full_path ) ) 126 m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present; 127 } 128 129 // inspect ( .htm, .html, .shtml, .css ) -----------------------------------// 130 inspect(const string & library_name,const path & full_path,const string & contents)131 void link_check::inspect( 132 const string & library_name, 133 const path & full_path, // example: c:/foo/boost/filesystem/path.hpp 134 const string & contents ) // contents of file to be inspected 135 { 136 if (contents.find( "boostinspect:" "nounlinked" ) != string::npos) 137 m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors; 138 139 bool no_link_errors = 140 (contents.find( "boostinspect:" "nolink" ) != string::npos); 141 142 // build bookmarks databases 143 bookmarks.clear(); 144 bookmarks_lowercase.clear(); 145 string::const_iterator a_start( contents.begin() ); 146 string::const_iterator a_end( contents.end() ); 147 boost::match_results< string::const_iterator > a_what; 148 boost::match_flag_type a_flags = boost::match_default; 149 150 if(!is_css(full_path)) 151 { 152 string previous_id; 153 154 while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) ) 155 { 156 // a_what[0] contains the whole string iterators. 157 // a_what[1] contains the tag iterators. 158 // a_what[2] contains the attribute name. 159 // a_what[4] contains the bookmark iterators. 160 161 if (a_what[4].matched) 162 { 163 string tag( a_what[1].first, a_what[1].second ); 164 boost::algorithm::to_lower(tag); 165 string attribute( a_what[2].first, a_what[2].second ); 166 boost::algorithm::to_lower(attribute); 167 string bookmark( a_what[4].first, a_what[4].second ); 168 169 bool name_following_id = ( attribute == "name" && previous_id == bookmark ); 170 if ( tag != "meta" && attribute == "id" ) previous_id = bookmark; 171 else previous_id.clear(); 172 173 if ( tag != "meta" && !name_following_id ) 174 { 175 bookmarks.insert( bookmark ); 176 // std::cout << "******************* " << bookmark << '\n'; 177 178 // w3.org recommends case-insensitive checking for duplicate bookmarks 179 // since some browsers do a case-insensitive match. 180 string bookmark_lowercase( bookmark ); 181 boost::algorithm::to_lower(bookmark_lowercase); 182 183 std::pair<bookmark_set::iterator, bool> result 184 = bookmarks_lowercase.insert( bookmark_lowercase ); 185 if (!result.second) 186 { 187 ++m_duplicate_bookmark_errors; 188 int ln = std::count( contents.begin(), a_what[3].first, '\n' ) + 1; 189 error( library_name, full_path, "Duplicate bookmark: " + bookmark, ln ); 190 } 191 } 192 } 193 194 a_start = a_what[0].second; // update search position 195 a_flags |= boost::match_prev_avail; // update flags 196 a_flags |= boost::match_not_bob; 197 } 198 } 199 200 // process urls 201 string::const_iterator start( contents.begin() ); 202 string::const_iterator end( contents.end() ); 203 boost::match_results< string::const_iterator > what; 204 boost::match_flag_type flags = boost::match_default; 205 206 if(!is_css(full_path)) 207 { 208 while( boost::regex_search( start, end, what, html_url_regex, flags) ) 209 { 210 // what[0] contains the whole string iterators. 211 // what[1] contains the element type iterators. 212 // what[3] contains the URL iterators. 213 214 if(what[3].matched) 215 { 216 string type( what[1].first, what[1].second ); 217 boost::algorithm::to_lower(type); 218 219 // TODO: Complain if 'link' tags use external stylesheets. 220 do_url( string( what[3].first, what[3].second ), 221 library_name, full_path, no_link_errors, 222 type == "a" || type == "link", contents.begin(), what[3].first ); 223 } 224 225 start = what[0].second; // update search position 226 flags |= boost::match_prev_avail; // update flags 227 flags |= boost::match_not_bob; 228 } 229 } 230 231 while( boost::regex_search( start, end, what, css_url_regex, flags) ) 232 { 233 // what[0] contains the whole string iterators. 234 // what[2] contains the URL iterators. 235 236 if(what[2].matched) 237 { 238 do_url( string( what[2].first, what[2].second ), 239 library_name, full_path, no_link_errors, false, 240 contents.begin(), what[3].first ); 241 } 242 243 start = what[0].second; // update search position 244 flags |= boost::match_prev_avail; // update flags 245 flags |= boost::match_not_bob; 246 } 247 } 248 249 // do_url ------------------------------------------------------------------// 250 do_url(const string & url,const string & library_name,const path & source_path,bool no_link_errors,bool allow_external_content,std::string::const_iterator contents_begin,std::string::const_iterator url_start)251 void link_check::do_url( const string & url, const string & library_name, 252 const path & source_path, bool no_link_errors, bool allow_external_content, 253 std::string::const_iterator contents_begin, std::string::const_iterator url_start ) 254 // precondition: source_path.is_complete() 255 { 256 if(!no_link_errors && url.empty()) { 257 ++m_invalid_errors; 258 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 259 error( library_name, source_path, "Empty URL.", ln ); 260 return; 261 } 262 263 // Decode ampersand encoded characters. 264 string decoded_url = is_css(source_path) ? url : decode_ampersands(url); 265 if(decoded_url.empty()) { 266 if(!no_link_errors) { 267 ++m_invalid_errors; 268 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 269 error( library_name, source_path, 270 "Invalid URL (invalid ampersand encodings): " + url, ln ); 271 } 272 return; 273 } 274 275 boost::smatch m; 276 if(!boost::regex_match(decoded_url, m, url_decompose_regex)) { 277 if(!no_link_errors) { 278 ++m_invalid_errors; 279 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 280 error( library_name, source_path, "Invalid URL: " + decoded_url, ln ); 281 } 282 return; 283 } 284 285 bool scheme_matched = m[2].matched, 286 authority_matched = m[4].matched, 287 //query_matched = m[7].matched, 288 fragment_matched = m[9].matched; 289 290 std::string scheme(m[2]), 291 authority(m[4]), 292 url_path(m[5]), 293 //query(m[7]), 294 fragment(m[9]); 295 296 // Check for external content 297 if(!allow_external_content && (authority_matched || scheme_matched)) { 298 if(!no_link_errors) { 299 ++m_invalid_errors; 300 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 301 error( library_name, source_path, "External content: " + decoded_url, ln ); 302 } 303 } 304 305 // Protocol checks 306 if(scheme_matched) { 307 if(scheme == "http" || scheme == "https") { 308 // All http links should have a hostname. Generally if they don't 309 // it's by mistake. If they shouldn't, then a protocol isn't 310 // required. 311 if(!authority_matched) { 312 if(!no_link_errors) { 313 ++m_invalid_errors; 314 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 315 error( library_name, source_path, "No hostname: " + decoded_url, ln ); 316 } 317 } 318 319 return; 320 } 321 else if(scheme == "file") { 322 if(!no_link_errors) { 323 ++m_invalid_errors; 324 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 325 error( library_name, source_path, 326 "Invalid URL (hardwired file): " + decoded_url, ln ); 327 } 328 } 329 else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") { 330 if ( !no_link_errors && is_css(source_path) ) { 331 ++m_invalid_errors; 332 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 333 error( library_name, source_path, 334 "Invalid protocol for css: " + decoded_url, ln ); 335 } 336 } 337 else { 338 if(!no_link_errors) { 339 ++m_invalid_errors; 340 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 341 error( library_name, source_path, "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln ); 342 } 343 } 344 345 return; 346 } 347 348 // Hostname without protocol. 349 if(authority_matched) { 350 if(!no_link_errors) { 351 ++m_invalid_errors; 352 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 353 error( library_name, source_path, 354 "Invalid URL (hostname without protocol): " + decoded_url, ln ); 355 } 356 } 357 358 // Check the fragment identifier 359 if ( fragment_matched ) { 360 if ( is_css(source_path) ) { 361 if ( !no_link_errors ) { 362 ++m_invalid_errors; 363 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 364 error( library_name, source_path, 365 "Fragment link in CSS: " + decoded_url, ln ); 366 } 367 } 368 else { 369 if ( !no_link_errors && fragment.find( '#' ) != string::npos ) 370 { 371 ++m_bookmark_errors; 372 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 373 error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln ); 374 } 375 else if ( !no_link_errors && url_path.empty() && !fragment.empty() 376 // w3.org recommends case-sensitive broken bookmark checking 377 // since some browsers do a case-sensitive match. 378 && bookmarks.find(decode_percents(fragment)) == bookmarks.end() ) 379 { 380 ++m_broken_errors; 381 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 382 error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln ); 383 } 384 } 385 386 // No more to do if it's just a fragment identifier 387 if(url_path.empty()) return; 388 } 389 390 // Detect characters banned by RFC2396: 391 if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos ) 392 { 393 ++m_invalid_errors; 394 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 395 error( library_name, source_path, 396 "Invalid character in URL: " + decoded_url, ln ); 397 } 398 399 // Check that we actually have a path. 400 if(url_path.empty()) { 401 if(!no_link_errors) { 402 ++m_invalid_errors; 403 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 404 error( library_name, source_path, 405 "Invalid URL (empty path in relative url): " + decoded_url, ln ); 406 } 407 } 408 409 // Decode percent encoded characters. 410 string decoded_path = decode_percents(url_path); 411 if(decoded_path.empty()) { 412 if(!no_link_errors) { 413 ++m_invalid_errors; 414 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 415 error( library_name, source_path, 416 "Invalid URL (invalid character encodings): " + decoded_url, ln ); 417 } 418 return; 419 } 420 421 // strip url of references to current dir 422 if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 ); 423 424 // url is relative source_path.branch() 425 // convert to target_path, which is_complete() 426 path target_path; 427 try { target_path = source_path.branch_path() /= path( decoded_path ); } 428 catch ( const fs::filesystem_error & ) 429 { 430 if(!no_link_errors) { 431 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 432 ++m_invalid_errors; 433 error( library_name, source_path, 434 "Invalid URL (error resolving path): " + decoded_url, ln ); 435 } 436 return; 437 } 438 439 // create a m_paths entry if necessary 440 std::pair< const string, int > entry( 441 relative_to( target_path, search_root_path() ), 0 ); 442 m_path_map::iterator itr( m_paths.find( entry.first ) ); 443 if ( itr == m_paths.end() ) 444 { 445 if ( fs::exists( target_path ) ) entry.second = m_present; 446 itr = m_paths.insert( entry ).first; 447 } 448 449 // itr now points to the m_paths entry 450 itr->second |= m_linked_to; 451 452 // if target isn't present, the link is broken 453 if ( !no_link_errors && (itr->second & m_present) == 0 ) 454 { 455 ++m_broken_errors; 456 int ln = std::count( contents_begin, url_start, '\n' ) + 1; 457 error( library_name, source_path, "Broken link: " + decoded_url, ln ); 458 } 459 } 460 461 // close -------------------------------------------------------------------// 462 close()463 void link_check::close() 464 { 465 for ( m_path_map::const_iterator itr = m_paths.begin(); 466 itr != m_paths.end(); ++itr ) 467 { 468 // std::clog << itr->first << " " << itr->second << "\n"; 469 if ( (itr->second & m_linked_to) != m_linked_to 470 && (itr->second & m_nounlinked_errors) != m_nounlinked_errors 471 && (itr->first.rfind( ".html" ) == itr->first.size()-5 472 || itr->first.rfind( ".htm" ) == itr->first.size()-4 473 || itr->first.rfind( ".css" ) == itr->first.size()-4) 474 // because they may be redirectors, it is OK if these are unlinked: 475 && itr->first.rfind( "index.html" ) == string::npos 476 && itr->first.rfind( "index.htm" ) == string::npos ) 477 { 478 ++m_unlinked_errors; 479 path full_path( search_root_path() / path(itr->first) ); 480 error( impute_library( full_path ), full_path, "Unlinked file" ); 481 } 482 } 483 } 484 485 } // namespace inspect 486 } // namespace boost 487 488