1/* 2 * Copyright (C) 2005, 2007, 2008 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com) 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of 15 * its contributors may be used to endorse or promote products derived 16 * from this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 */ 29 30#import "WebNSURLExtras.h" 31 32#import "WebKitNSStringExtras.h" 33#import "WebLocalizableStrings.h" 34#import "WebNSDataExtras.h" 35#import "WebNSObjectExtras.h" 36#import "WebSystemInterface.h" 37#import <Foundation/NSURLRequest.h> 38#import <WebCore/KURL.h> 39#import <WebCore/LoaderNSURLExtras.h> 40#import <WebKitSystemInterface.h> 41#import <wtf/Assertions.h> 42#import <unicode/uchar.h> 43#import <unicode/uidna.h> 44#import <unicode/uscript.h> 45 46using namespace WebCore; 47using namespace WTF; 48 49typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context); 50 51// Needs to be big enough to hold an IDN-encoded name. 52// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. 53#define HOST_NAME_BUFFER_LENGTH 2048 54 55#define URL_BYTES_BUFFER_LENGTH 2048 56 57static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT; 58static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32]; 59 60static inline BOOL isLookalikeCharacter(int charCode) 61{ 62// FIXME: Move this code down into WebCore so it can be shared with other platforms. 63 64// This function treats the following as unsafe, lookalike characters: 65// any non-printable character, any character considered as whitespace that isn't already converted to a space by ICU, 66// and any ignorable character. 67 68// We also considered the characters in Mozilla's blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars), 69// and included all of these characters that ICU can encode. 
70 71 if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) 72 return YES; 73 74 switch (charCode) { 75 case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */ 76 case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */ 77 case 0x0251: /* LATIN SMALL LETTER ALPHA */ 78 case 0x0261: /* LATIN SMALL LETTER SCRIPT G */ 79 case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */ 80 case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */ 81 case 0x05B4: /* HEBREW POINT HIRIQ */ 82 case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */ 83 case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */ 84 case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */ 85 case 0x0660: /* ARABIC INDIC DIGIT ZERO */ 86 case 0x06D4: /* ARABIC FULL STOP */ 87 case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */ 88 case 0x2027: /* HYPHENATION POINT */ 89 case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ 90 case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ 91 case 0x2044: /* FRACTION SLASH */ 92 case 0x2215: /* DIVISION SLASH */ 93 case 0x2216: /* SET MINUS */ 94 case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */ 95 case 0x23AE: /* INTEGRAL EXTENSION */ 96 case 0x244A: /* OCR DOUBLE BACKSLASH */ 97 case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */ 98 case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */ 99 case 0x29F8: /* BIG SOLIDUS */ 100 case 0x29f6: /* SOLIDUS WITH OVERBAR */ 101 case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */ 102 case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */ 103 case 0x3008: /* LEFT ANGLE BRACKET */ 104 case 0x3014: /* LEFT TORTOISE SHELL BRACKET */ 105 case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */ 106 case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */ 107 case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */ 108 case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */ 109 case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */ 110 case 0x33DF: /* SQUARE A OVER M */ 111 case 0xFE14: 
/* PRESENTATION FORM FOR VERTICAL SEMICOLON */ 112 case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ 113 case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */ 114 case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */ 115 case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */ 116 return YES; 117 default: 118 return NO; 119 } 120} 121 122static char hexDigit(int i) 123{ 124 if (i < 0 || i > 16) { 125 LOG_ERROR("illegal hex digit"); 126 return '0'; 127 } 128 int h = i; 129 if (h >= 10) { 130 h = h - 10 + 'A'; 131 } 132 else { 133 h += '0'; 134 } 135 return h; 136} 137 138static BOOL isHexDigit(char c) 139{ 140 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); 141} 142 143static int hexDigitValue(char c) 144{ 145 if (c >= '0' && c <= '9') { 146 return c - '0'; 147 } 148 if (c >= 'A' && c <= 'F') { 149 return c - 'A' + 10; 150 } 151 if (c >= 'a' && c <= 'f') { 152 return c - 'a' + 10; 153 } 154 LOG_ERROR("illegal hex digit"); 155 return 0; 156} 157 158static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context) 159{ 160 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character. 161 // Skip quoted strings so that characters in them don't confuse us. 162 // When we find a '?' character, we are past the part of the URL that contains host names. 
163 164 static NSCharacterSet *hostNameOrStringStartCharacters; 165 if (hostNameOrStringStartCharacters == nil) { 166 hostNameOrStringStartCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"@?"]; 167 CFRetain(hostNameOrStringStartCharacters); 168 } 169 static NSCharacterSet *hostNameEndCharacters; 170 if (hostNameEndCharacters == nil) { 171 hostNameEndCharacters = [NSCharacterSet characterSetWithCharactersInString:@">,?"]; 172 CFRetain(hostNameEndCharacters); 173 } 174 static NSCharacterSet *quotedStringCharacters; 175 if (quotedStringCharacters == nil) { 176 quotedStringCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"\\"]; 177 CFRetain(quotedStringCharacters); 178 } 179 180 unsigned stringLength = [string length]; 181 NSRange remaining = NSMakeRange(0, stringLength); 182 183 while (1) { 184 // Find start of host name or of quoted string. 185 NSRange hostNameOrStringStart = [string rangeOfCharacterFromSet:hostNameOrStringStartCharacters options:0 range:remaining]; 186 if (hostNameOrStringStart.location == NSNotFound) { 187 return; 188 } 189 unichar c = [string characterAtIndex:hostNameOrStringStart.location]; 190 remaining.location = NSMaxRange(hostNameOrStringStart); 191 remaining.length = stringLength - remaining.location; 192 193 if (c == '?') { 194 return; 195 } 196 197 if (c == '@') { 198 // Find end of host name. 199 unsigned hostNameStart = remaining.location; 200 NSRange hostNameEnd = [string rangeOfCharacterFromSet:hostNameEndCharacters options:0 range:remaining]; 201 BOOL done; 202 if (hostNameEnd.location == NSNotFound) { 203 hostNameEnd.location = stringLength; 204 done = YES; 205 } else { 206 remaining.location = hostNameEnd.location; 207 remaining.length = stringLength - remaining.location; 208 done = NO; 209 } 210 211 // Process host name range. 212 f(string, NSMakeRange(hostNameStart, hostNameEnd.location - hostNameStart), context); 213 214 if (done) { 215 return; 216 } 217 } else { 218 // Skip quoted string. 
219 ASSERT(c == '"'); 220 while (1) { 221 NSRange escapedCharacterOrStringEnd = [string rangeOfCharacterFromSet:quotedStringCharacters options:0 range:remaining]; 222 if (escapedCharacterOrStringEnd.location == NSNotFound) { 223 return; 224 } 225 c = [string characterAtIndex:escapedCharacterOrStringEnd.location]; 226 remaining.location = NSMaxRange(escapedCharacterOrStringEnd); 227 remaining.length = stringLength - remaining.location; 228 229 // If we are the end of the string, then break from the string loop back to the host name loop. 230 if (c == '"') { 231 break; 232 } 233 234 // Skip escaped character. 235 ASSERT(c == '\\'); 236 if (remaining.length == 0) { 237 return; 238 } 239 remaining.location += 1; 240 remaining.length -= 1; 241 } 242 } 243 } 244} 245 246static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context) 247{ 248 // Find hostnames. Too bad we can't use any real URL-parsing code to do this, 249 // but we have to do it before doing all the %-escaping, and this is the only 250 // code we have that parses mailto URLs anyway. 251 252 // Maybe we should implement this using a character buffer instead? 253 254 if ([string _webkit_hasCaseInsensitivePrefix:@"mailto:"]) { 255 applyHostNameFunctionToMailToURLString(string, f, context); 256 return; 257 } 258 259 // Find the host name in a hierarchical URL. 260 // It comes after a "://" sequence, with scheme characters preceding. 261 // If ends with the end of the string or a ":", "/", or a "?". 262 // If there is a "@" character, the host part is just the part after the "@". 263 NSRange separatorRange = [string rangeOfString:@"://"]; 264 if (separatorRange.location == NSNotFound) { 265 return; 266 } 267 268 // Check that all characters before the :// are valid scheme characters. 
269 static NSCharacterSet *nonSchemeCharacters; 270 if (nonSchemeCharacters == nil) { 271 nonSchemeCharacters = [[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet]; 272 CFRetain(nonSchemeCharacters); 273 } 274 if ([string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:NSMakeRange(0, separatorRange.location)].location != NSNotFound) { 275 return; 276 } 277 278 unsigned stringLength = [string length]; 279 280 static NSCharacterSet *hostTerminators; 281 if (hostTerminators == nil) { 282 hostTerminators = [NSCharacterSet characterSetWithCharactersInString:@":/?#"]; 283 CFRetain(hostTerminators); 284 } 285 286 // Start after the separator. 287 unsigned authorityStart = NSMaxRange(separatorRange); 288 289 // Find terminating character. 290 NSRange hostNameTerminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:NSMakeRange(authorityStart, stringLength - authorityStart)]; 291 unsigned hostNameEnd = hostNameTerminator.location == NSNotFound ? stringLength : hostNameTerminator.location; 292 293 // Find "@" for the start of the host name. 294 NSRange userInfoTerminator = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostNameEnd - authorityStart)]; 295 unsigned hostNameStart = userInfoTerminator.location == NSNotFound ? authorityStart : NSMaxRange(userInfoTerminator); 296 297 f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context); 298} 299 300@implementation NSURL (WebNSURLExtras) 301 302static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode) 303{ 304 BOOL needsMapping = encode 305 ? 
[string _web_hostNameNeedsEncodingWithRange:range] 306 : [string _web_hostNameNeedsDecodingWithRange:range]; 307 if (!needsMapping) { 308 return; 309 } 310 311 NSMutableArray **array = (NSMutableArray **)context; 312 if (*array == nil) { 313 *array = [[NSMutableArray alloc] init]; 314 } 315 316 [*array addObject:[NSValue valueWithRange:range]]; 317} 318 319static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context) 320{ 321 return collectRangesThatNeedMapping(string, range, context, YES); 322} 323 324static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context) 325{ 326 return collectRangesThatNeedMapping(string, range, context, NO); 327} 328 329static NSString *mapHostNames(NSString *string, BOOL encode) 330{ 331 // Generally, we want to optimize for the case where there is one host name that does not need mapping. 332 333 if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding]) 334 return string; 335 336 // Make a list of ranges that actually need mapping. 337 NSMutableArray *hostNameRanges = nil; 338 StringRangeApplierFunction f = encode 339 ? collectRangesThatNeedEncoding 340 : collectRangesThatNeedDecoding; 341 applyHostNameFunctionToURLString(string, f, &hostNameRanges); 342 if (hostNameRanges == nil) 343 return string; 344 345 // Do the mapping. 346 NSMutableString *mutableCopy = [string mutableCopy]; 347 unsigned i = [hostNameRanges count]; 348 while (i-- != 0) { 349 NSRange hostNameRange = [[hostNameRanges objectAtIndex:i] rangeValue]; 350 NSString *mappedHostName = encode 351 ? 
[string _web_encodeHostNameWithRange:hostNameRange] 352 : [string _web_decodeHostNameWithRange:hostNameRange]; 353 [mutableCopy replaceCharactersInRange:hostNameRange withString:mappedHostName]; 354 } 355 [hostNameRanges release]; 356 return [mutableCopy autorelease]; 357} 358 359+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string relativeToURL:(NSURL *)URL 360{ 361 if (string == nil) { 362 return nil; 363 } 364 string = mapHostNames([string _webkit_stringByTrimmingWhitespace], YES); 365 366 NSData *userTypedData = [string dataUsingEncoding:NSUTF8StringEncoding]; 367 ASSERT(userTypedData); 368 369 const UInt8 *inBytes = static_cast<const UInt8 *>([userTypedData bytes]); 370 int inLength = [userTypedData length]; 371 if (inLength == 0) { 372 return [NSURL URLWithString:@""]; 373 } 374 375 char *outBytes = static_cast<char *>(malloc(inLength * 3)); // large enough to %-escape every character 376 char *p = outBytes; 377 int outLength = 0; 378 int i; 379 for (i = 0; i < inLength; i++) { 380 UInt8 c = inBytes[i]; 381 if (c <= 0x20 || c >= 0x7f) { 382 *p++ = '%'; 383 *p++ = hexDigit(c >> 4); 384 *p++ = hexDigit(c & 0xf); 385 outLength += 3; 386 } 387 else { 388 *p++ = c; 389 outLength++; 390 } 391 } 392 393 NSData *data = [NSData dataWithBytesNoCopy:outBytes length:outLength]; // adopts outBytes 394 return [self _web_URLWithData:data relativeToURL:URL]; 395} 396 397+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string 398{ 399 return [self _web_URLWithUserTypedString:string relativeToURL:nil]; 400} 401 402+ (NSURL *)_web_URLWithDataAsString:(NSString *)string 403{ 404 if (string == nil) { 405 return nil; 406 } 407 return [self _web_URLWithDataAsString:string relativeToURL:nil]; 408} 409 410+ (NSURL *)_web_URLWithDataAsString:(NSString *)string relativeToURL:(NSURL *)baseURL 411{ 412 if (string == nil) { 413 return nil; 414 } 415 string = [string _webkit_stringByTrimmingWhitespace]; 416 NSData *data = [string dataUsingEncoding:NSISOLatin1StringEncoding]; 417 
return [self _web_URLWithData:data relativeToURL:baseURL]; 418} 419 420+ (NSURL *)_web_URLWithData:(NSData *)data 421{ 422 return [NSURL _web_URLWithData:data relativeToURL:nil]; 423} 424 425+ (NSURL *)_web_URLWithData:(NSData *)data relativeToURL:(NSURL *)baseURL 426{ 427 if (data == nil) 428 return nil; 429 430 NSURL *result = nil; 431 size_t length = [data length]; 432 if (length > 0) { 433 // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components. 434 baseURL = [baseURL _webkit_URLByRemovingResourceSpecifier]; 435 436 const UInt8 *bytes = static_cast<const UInt8*>([data bytes]); 437 // NOTE: We use UTF-8 here since this encoding is used when computing strings when returning URL components 438 // (e.g calls to NSURL -path). However, this function is not tolerant of illegal UTF-8 sequences, which 439 // could either be a malformed string or bytes in a different encoding, like shift-jis, so we fall back 440 // onto using ISO Latin 1 in those cases. 
441 result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES)); 442 if (!result) 443 result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES)); 444 } else 445 result = [NSURL URLWithString:@""]; 446 447 return result; 448} 449 450- (NSData *)_web_originalData 451{ 452 UInt8 *buffer = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH); 453 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH); 454 if (bytesFilled == -1) { 455 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0); 456 buffer = (UInt8 *)realloc(buffer, bytesToAllocate); 457 bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate); 458 ASSERT(bytesFilled == bytesToAllocate); 459 } 460 461 // buffer is adopted by the NSData 462 NSData *data = [NSData dataWithBytesNoCopy:buffer length:bytesFilled freeWhenDone:YES]; 463 464 NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)self); 465 if (baseURL) 466 return [[NSURL _web_URLWithData:data relativeToURL:baseURL] _web_originalData]; 467 return data; 468} 469 470- (NSString *)_web_originalDataAsString 471{ 472 return [[[NSString alloc] initWithData:[self _web_originalData] encoding:NSISOLatin1StringEncoding] autorelease]; 473} 474 475static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string) 476{ 477 CFIndex length = CFStringGetLength(string); 478 Vector<UChar, 2048> sourceBuffer(length); 479 CFStringGetCharacters(string, CFRangeMake(0, length), sourceBuffer.data()); 480 481 Vector<UChar, 2048> outBuffer; 482 483 CFIndex i = 0; 484 while (i < length) { 485 UChar32 c; 486 U16_NEXT(sourceBuffer, i, length, c) 487 488 if (isLookalikeCharacter(c)) { 489 uint8_t utf8Buffer[4]; 490 CFIndex offset = 0; 491 UBool failure = false; 492 U8_APPEND(utf8Buffer, offset, 4, c, failure) 493 ASSERT(!failure); 494 495 for (CFIndex j = 0; j < offset; ++j) { 496 
outBuffer.append('%'); 497 outBuffer.append(hexDigit(utf8Buffer[j] >> 4)); 498 outBuffer.append(hexDigit(utf8Buffer[j] & 0xf)); 499 } 500 } else { 501 UChar utf16Buffer[2]; 502 CFIndex offset = 0; 503 UBool failure = false; 504 U16_APPEND(utf16Buffer, offset, 2, c, failure) 505 ASSERT(!failure); 506 for (CFIndex j = 0; j < offset; ++j) 507 outBuffer.append(utf16Buffer[j]); 508 } 509 } 510 511 return CFStringCreateWithCharacters(NULL, outBuffer.data(), outBuffer.size()); 512} 513 514- (NSString *)_web_userVisibleString 515{ 516 NSData *data = [self _web_originalData]; 517 const unsigned char *before = static_cast<const unsigned char*>([data bytes]); 518 int length = [data length]; 519 520 bool needsHostNameDecoding = false; 521 522 const unsigned char *p = before; 523 int bufferLength = (length * 3) + 1; 524 char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character 525 char *q = after; 526 int i; 527 for (i = 0; i < length; i++) { 528 unsigned char c = p[i]; 529 // unescape escape sequences that indicate bytes greater than 0x7f 530 if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) { 531 unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]); 532 if (u > 0x7f) { 533 // unescape 534 *q++ = u; 535 } else { 536 // do not unescape 537 *q++ = p[i]; 538 *q++ = p[i + 1]; 539 *q++ = p[i + 2]; 540 } 541 i += 2; 542 } else { 543 *q++ = c; 544 545 // Check for "xn--" in an efficient, non-case-sensitive, way. 546 if (c == '-' && i >= 3 && !needsHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-') 547 needsHostNameDecoding = true; 548 } 549 } 550 *q = '\0'; 551 552 // Check string to see if it can be converted to display using UTF-8 553 NSString *result = [NSString stringWithUTF8String:after]; 554 if (!result) { 555 // Could not convert to UTF-8. 556 // Convert characters greater than 0x7f to escape sequences. 
557 // Shift current string to the end of the buffer 558 // then we will copy back bytes to the start of the buffer 559 // as we convert. 560 int afterlength = q - after; 561 char *p = after + bufferLength - afterlength - 1; 562 memmove(p, after, afterlength + 1); // copies trailing '\0' 563 char *q = after; 564 while (*p) { 565 unsigned char c = *p; 566 if (c > 0x7f) { 567 *q++ = '%'; 568 *q++ = hexDigit(c >> 4); 569 *q++ = hexDigit(c & 0xf); 570 } else { 571 *q++ = *p; 572 } 573 p++; 574 } 575 *q = '\0'; 576 result = [NSString stringWithUTF8String:after]; 577 } 578 579 free(after); 580 581 result = mapHostNames(result, !needsHostNameDecoding); 582 result = [result precomposedStringWithCanonicalMapping]; 583 return WebCFAutorelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result)); 584} 585 586- (BOOL)_web_isEmpty 587{ 588 if (!CFURLGetBaseURL((CFURLRef)self)) 589 return CFURLGetBytes((CFURLRef)self, NULL, 0) == 0; 590 return [[self _web_originalData] length] == 0; 591} 592 593- (const char *)_web_URLCString 594{ 595 NSMutableData *data = [NSMutableData data]; 596 [data appendData:[self _web_originalData]]; 597 [data appendBytes:"\0" length:1]; 598 return (const char *)[data bytes]; 599 } 600 601- (NSURL *)_webkit_canonicalize 602{ 603 NSURLRequest *request = [[NSURLRequest alloc] initWithURL:self]; 604 Class concreteClass = WKNSURLProtocolClassForRequest(request); 605 if (!concreteClass) { 606 [request release]; 607 return self; 608 } 609 610 // This applies NSURL's concept of canonicalization, but not KURL's concept. It would 611 // make sense to apply both, but when we tried that it caused a performance degradation 612 // (see 5315926). It might make sense to apply only the KURL concept and not the NSURL 613 // concept, but it's too risky to make that change for WebKit 3.0. 
614 NSURLRequest *newRequest = [concreteClass canonicalRequestForRequest:request]; 615 NSURL *newURL = [newRequest URL]; 616 NSURL *result = [[newURL retain] autorelease]; 617 [request release]; 618 619 return result; 620} 621 622typedef struct { 623 NSString *scheme; 624 NSString *user; 625 NSString *password; 626 NSString *host; 627 CFIndex port; // kCFNotFound means ignore/omit 628 NSString *path; 629 NSString *query; 630 NSString *fragment; 631} WebKitURLComponents; 632 633- (NSURL *)_webkit_URLByRemovingComponent:(CFURLComponentType)component 634{ 635 CFRange fragRg = CFURLGetByteRangeForComponent((CFURLRef)self, component, NULL); 636 // Check to see if a fragment exists before decomposing the URL. 637 if (fragRg.location == kCFNotFound) 638 return self; 639 640 UInt8 *urlBytes, buffer[2048]; 641 CFIndex numBytes = CFURLGetBytes((CFURLRef)self, buffer, 2048); 642 if (numBytes == -1) { 643 numBytes = CFURLGetBytes((CFURLRef)self, NULL, 0); 644 urlBytes = static_cast<UInt8*>(malloc(numBytes)); 645 CFURLGetBytes((CFURLRef)self, urlBytes, numBytes); 646 } else 647 urlBytes = buffer; 648 649 NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingUTF8, NULL)); 650 if (!result) 651 result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingISOLatin1, NULL)); 652 653 if (urlBytes != buffer) free(urlBytes); 654 return result ? 
[result autorelease] : self; 655} 656 657- (NSURL *)_webkit_URLByRemovingFragment 658{ 659 return [self _webkit_URLByRemovingComponent:kCFURLComponentFragment]; 660} 661 662- (NSURL *)_webkit_URLByRemovingResourceSpecifier 663{ 664 return [self _webkit_URLByRemovingComponent:kCFURLComponentResourceSpecifier]; 665} 666 667- (BOOL)_webkit_isJavaScriptURL 668{ 669 return [[self _web_originalDataAsString] _webkit_isJavaScriptURL]; 670} 671 672- (NSString *)_webkit_scriptIfJavaScriptURL 673{ 674 return [[self absoluteString] _webkit_scriptIfJavaScriptURL]; 675} 676 677- (BOOL)_webkit_isFileURL 678{ 679 return [[self _web_originalDataAsString] _webkit_isFileURL]; 680} 681 682- (BOOL)_webkit_isFTPDirectoryURL 683{ 684 return [[self _web_originalDataAsString] _webkit_isFTPDirectoryURL]; 685} 686 687- (BOOL)_webkit_shouldLoadAsEmptyDocument 688{ 689 return [[self _web_originalDataAsString] _webkit_hasCaseInsensitivePrefix:@"about:"] || [self _web_isEmpty]; 690} 691 692- (NSURL *)_web_URLWithLowercasedScheme 693{ 694 CFRange range; 695 CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &range); 696 if (range.location == kCFNotFound) { 697 return self; 698 } 699 700 UInt8 static_buffer[URL_BYTES_BUFFER_LENGTH]; 701 UInt8 *buffer = static_buffer; 702 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH); 703 if (bytesFilled == -1) { 704 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0); 705 buffer = static_cast<UInt8 *>(malloc(bytesToAllocate)); 706 bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate); 707 ASSERT(bytesFilled == bytesToAllocate); 708 } 709 710 int i; 711 BOOL changed = NO; 712 for (i = 0; i < range.length; ++i) { 713 char c = buffer[range.location + i]; 714 char lower = toASCIILower(c); 715 if (c != lower) { 716 buffer[range.location + i] = lower; 717 changed = YES; 718 } 719 } 720 721 NSURL *result = changed 722 ? 
(NSURL *)WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, buffer, bytesFilled, kCFStringEncodingUTF8, nil, YES)) 723 : (NSURL *)self; 724 725 if (buffer != static_buffer) { 726 free(buffer); 727 } 728 729 return result; 730} 731 732 733-(BOOL)_web_hasQuestionMarkOnlyQueryString 734{ 735 CFRange rangeWithSeparators; 736 CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentQuery, &rangeWithSeparators); 737 if (rangeWithSeparators.location != kCFNotFound && rangeWithSeparators.length == 1) { 738 return YES; 739 } 740 return NO; 741} 742 743-(NSData *)_web_schemeSeparatorWithoutColon 744{ 745 NSData *result = nil; 746 CFRange rangeWithSeparators; 747 CFRange range = CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &rangeWithSeparators); 748 if (rangeWithSeparators.location != kCFNotFound) { 749 NSString *absoluteString = [self absoluteString]; 750 NSRange separatorsRange = NSMakeRange(range.location + range.length + 1, rangeWithSeparators.length - range.length - 1); 751 if (separatorsRange.location + separatorsRange.length <= [absoluteString length]) { 752 NSString *slashes = [absoluteString substringWithRange:separatorsRange]; 753 result = [slashes dataUsingEncoding:NSISOLatin1StringEncoding]; 754 } 755 } 756 return result; 757} 758 759#define completeURL (CFURLComponentType)-1 760 761-(NSData *)_web_dataForURLComponentType:(CFURLComponentType)componentType 762{ 763 static int URLComponentTypeBufferLength = 2048; 764 765 UInt8 staticAllBytesBuffer[URLComponentTypeBufferLength]; 766 UInt8 *allBytesBuffer = staticAllBytesBuffer; 767 768 CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, URLComponentTypeBufferLength); 769 if (bytesFilled == -1) { 770 CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0); 771 allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate)); 772 bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, bytesToAllocate); 773 } 774 775 CFRange range; 776 if (componentType != 
completeURL) { 777 range = CFURLGetByteRangeForComponent((CFURLRef)self, componentType, NULL); 778 if (range.location == kCFNotFound) { 779 return nil; 780 } 781 } 782 else { 783 range.location = 0; 784 range.length = bytesFilled; 785 } 786 787 NSData *componentData = [NSData dataWithBytes:allBytesBuffer + range.location length:range.length]; 788 789 const unsigned char *bytes = static_cast<const unsigned char *>([componentData bytes]); 790 NSMutableData *resultData = [NSMutableData data]; 791 // NOTE: add leading '?' to query strings non-zero length query strings. 792 // NOTE: retain question-mark only query strings. 793 if (componentType == kCFURLComponentQuery) { 794 if (range.length > 0 || [self _web_hasQuestionMarkOnlyQueryString]) { 795 [resultData appendBytes:"?" length:1]; 796 } 797 } 798 int i; 799 for (i = 0; i < range.length; i++) { 800 unsigned char c = bytes[i]; 801 if (c <= 0x20 || c >= 0x7f) { 802 char escaped[3]; 803 escaped[0] = '%'; 804 escaped[1] = hexDigit(c >> 4); 805 escaped[2] = hexDigit(c & 0xf); 806 [resultData appendBytes:escaped length:3]; 807 } 808 else { 809 char b[1]; 810 b[0] = c; 811 [resultData appendBytes:b length:1]; 812 } 813 } 814 815 if (staticAllBytesBuffer != allBytesBuffer) { 816 free(allBytesBuffer); 817 } 818 819 return resultData; 820} 821 822-(NSData *)_web_schemeData 823{ 824 return [self _web_dataForURLComponentType:kCFURLComponentScheme]; 825} 826 827-(NSData *)_web_hostData 828{ 829 NSData *result = [self _web_dataForURLComponentType:kCFURLComponentHost]; 830 NSData *scheme = [self _web_schemeData]; 831 // Take off localhost for file 832 if ([scheme _web_isCaseInsensitiveEqualToCString:"file"]) { 833 return ([result _web_isCaseInsensitiveEqualToCString:"localhost"]) ? 
nil : result;
    }
    return result;
}

// Decodes the bytes returned by -_web_hostData as UTF-8.
// When -_web_hostData returns nil, an empty NSData is decoded instead.
- (NSString *)_web_hostString
{
    NSData *data = [self _web_hostData];
    if (!data) {
        data = [NSData data];
    }
    // Use the nil-checked data computed above; calling -_web_hostData a second
    // time here would bypass the empty-data fallback.
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] autorelease];
}

// Forwards to suggestedFilenameWithMIMEType() with the receiver and the given MIME type.
- (NSString *)_webkit_suggestedFilenameWithMIMEType:(NSString *)MIMEType
{
    return suggestedFilenameWithMIMEType(self, MIMEType);
}

@end

@implementation NSString (WebNSURLExtras)

// Scans the UTF-8 form of the receiver and returns NO as soon as it finds one of:
//   - a byte <= 0x20 (control characters and space) or the byte 0x7f (delete),
//   - a %-escape sequence whose decoded value is greater than 0x7f,
//   - the sequence "xn--" (with 'x' and 'n' matched case-insensitively).
// These are the things that will lead _web_userVisibleString to actually change things.
// Returns YES when none of them is present.
- (BOOL)_web_isUserVisibleURL
{
    // Get a UTF-8 C string: try a stack buffer first and fall back on -UTF8String
    // when the string does not fit (or cannot be converted) there.
    char static_buffer[1024];
    const char *p;
    // The buffer size passed to CFStringGetCString includes room for the NUL terminator.
    if (CFStringGetCString((CFStringRef)self, static_buffer, sizeof(static_buffer), kCFStringEncodingUTF8)) {
        p = static_buffer;
    } else {
        p = [self UTF8String];
    }

    // -UTF8String can return NULL when the string cannot be converted; such a
    // string is reported as not user-visible instead of handing NULL to strlen().
    if (!p) {
        return NO;
    }

    size_t length = strlen(p);
    size_t i;
    for (i = 0; i < length; i++) {
        unsigned char c = p[i];
        // Control characters, space, and delete.
        if (c <= 0x20 || c == 0x7f) {
            return NO;
        }
        if (c == '%' && i + 2 < length && isHexDigit(p[i + 1]) && isHexDigit(p[i + 2])) {
            // A well-formed %XX escape: reject it if it encodes a non-ASCII byte.
            unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
            if (u > 0x7f) {
                return NO;
            }
            // Skip the two hex digits that were just examined.
            i += 2;
        } else {
            // Check for "xn--" in an efficient, non-case-sensitive, way: the test runs
            // when the final '-' is reached and looks back at the three preceding bytes.
            if (c == '-' && i >= 3 && (p[i - 3] | 0x20) == 'x' && (p[i - 2] | 0x20) == 'n' && p[i - 1] == '-') {
                return NO;
            }
        }
    }

    return YES;
}


// YES when the receiver starts with "javascript:", compared case-insensitively.
- (BOOL)_webkit_isJavaScriptURL
{
    return [self _webkit_hasCaseInsensitivePrefix:@"javascript:"];
}

// YES when the receiver starts with "file:", compared case-insensitively.
- (BOOL)_webkit_isFileURL
{
    return [self rangeOfString:@"file:" options:(NSCaseInsensitiveSearch | NSAnchoredSearch)].location != NSNotFound;
}

// Returns the result of decodeURLEscapeSequences() applied to the receiver.
- (NSString *)_webkit_stringByReplacingValidPercentEscapes
{
    return decodeURLEscapeSequences(self);
}

// For a "javascript:" URL, returns everything after the scheme with percent
// escapes replaced; returns nil for any other string.
- (NSString *)_webkit_scriptIfJavaScriptURL
{
    if (![self _webkit_isJavaScriptURL]) {
        return nil;
    }
    // 11 is the length of "javascript:".
    return [[self substringFromIndex:11] _webkit_stringByReplacingValidPercentEscapes];
}

// YES when the receiver starts with "ftp:" (case-insensitively) and ends with '/'.
- (BOOL)_webkit_isFTPDirectoryURL
{
    int length = [self length];
    if (length < 5) { // 5 is length of "ftp:/"
        return NO;
    }
    unichar lastChar = [self characterAtIndex:length - 1];
    return lastChar == '/' && [self _webkit_hasCaseInsensitivePrefix:@"ftp:"];
}


// Reads script names from the file at the given path and sets the bit for each
// recognized script in IDNScriptWhiteList.
// Returns NO when filename is nil or the file cannot be opened, YES otherwise.
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename) {
        return NO;
    }
    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (file == NULL) {
        return NO;
    }

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present: optional whitespace, '#', the rest of the
        // line, and the line terminator(s). Nothing is stored.
        int result = fscanf(file, " #%*[^\n\r]%*[\n\r]");
        if (result == EOF) {
            break;
        }

        // Read a script name if present. At most 32 characters are stored in word;
        // any further characters of an overlong word are discarded.
        char word[33];
        result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF) {
            break;
        }
        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // Unsigned shift: shifting a signed 1 into bit 31 is undefined behavior.
                uint32_t mask = 1U << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}

// Loads the IDN script white list. Looks for "IDNScriptWhiteList.txt" in each
// Library directory (all domains) and stops at the first one that can be read;
// otherwise falls back on the copy inside the com.apple.WebKit bundle.
static void readIDNScriptWhiteList(void)
{
    // Read white list from library.
    NSArray *dirs = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    int i, numDirs = [dirs count];
    for (i = 0; i < numDirs; i++) {
        NSString *dir = [dirs objectAtIndex:i];
        if (readIDNScriptWhiteListFile([dir stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"])) {
            return;
        }
    }

    // Fall back on white list inside bundle.
    NSBundle *bundle = [NSBundle bundleWithIdentifier:@"com.apple.WebKit"];
    readIDNScriptWhiteListFile([bundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"]);
}

// Returns YES only if every code point in the UTF-16 buffer belongs to a script
// whose bit is set in IDNScriptWhiteList and is not a lookalike character.
// The white list is read once, on first use, via pthread_once.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        UChar32 c;
        // Decodes the next code point (handling surrogate pairs) and advances i.
        U16_NEXT(buffer, i, length, c);
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        if (script >= USCRIPT_CODE_LIMIT) {
            return NO;
        }
        size_t index = script / 32;
        // Unsigned shift: shifting a signed 1 into bit 31 is undefined behavior.
        uint32_t mask = 1U << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask)) {
            return NO;
        }

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}

// Maps the host name found in the given range of the receiver through ICU's IDNA
// conversion: to ASCII when encode is YES, to Unicode when encode is NO.
// Return value of nil means no mapping is necessary.
// If makeString is NO, then return value is either nil or self to indicate mapping is necessary.
// If makeString is YES, then return value is either nil or the mapped string.
// nil is also returned when the range is longer than HOST_NAME_BUFFER_LENGTH, when
// the receiver is empty, when ICU reports anything other than U_ZERO_ERROR, and,
// for decoding, when the result contains characters outside the IDN script white list.
- (NSString *)_web_mapHostNameWithRange:(NSRange)range encode:(BOOL)encode makeString:(BOOL)makeString
{
    if (range.length > HOST_NAME_BUFFER_LENGTH) {
        return nil;
    }

    if ([self length] == 0)
        return nil;

    UChar sourceBuffer[HOST_NAME_BUFFER_LENGTH];
    UChar destinationBuffer[HOST_NAME_BUFFER_LENGTH];

    NSString *string = self;
    if (encode && [self rangeOfString:@"%" options:NSLiteralSearch range:range].location != NSNotFound) {
        // The host contains percent escapes: replace them before encoding and,
        // if that succeeds, operate on the whole unescaped substring instead.
        NSString *substring = [self substringWithRange:range];
        substring = WebCFAutorelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)substring, CFSTR("")));
        if (substring != nil) {
            string = substring;
            range = NSMakeRange(0, [string length]);
        }
    }

    int length = range.length;
    [string getCharacters:sourceBuffer range:range];

    UErrorCode error = U_ZERO_ERROR;
    int32_t numCharactersConverted = (encode ? uidna_IDNToASCII : uidna_IDNToUnicode)
        (sourceBuffer, length, destinationBuffer, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &error);
    if (error != U_ZERO_ERROR) {
        return nil;
    }
    // Conversion produced exactly the input: no mapping is necessary.
    if (numCharactersConverted == length && memcmp(sourceBuffer, destinationBuffer, length * sizeof(UChar)) == 0) {
        return nil;
    }
    if (!encode && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted)) {
        return nil;
    }
    return makeString ? (NSString *)[NSString stringWithCharacters:destinationBuffer length:numCharactersConverted] : (NSString *)self;
}

// YES when decoding the host name in the given range would produce a mapping.
- (BOOL)_web_hostNameNeedsDecodingWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:NO makeString:NO] != nil;
}

// YES when encoding the host name in the given range would produce a mapping.
- (BOOL)_web_hostNameNeedsEncodingWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:YES makeString:NO] != nil;
}

// Returns the decoded host name for the given range, or nil when no mapping is made.
- (NSString *)_web_decodeHostNameWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:NO makeString:YES];
}

// Returns the encoded host name for the given range, or nil when no mapping is made.
- (NSString *)_web_encodeHostNameWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:YES makeString:YES];
}

// Treats the whole receiver as a host name and returns its decoded form,
// or the receiver itself when no mapping is made.
- (NSString *)_web_decodeHostName
{
    NSString *name = [self _web_mapHostNameWithRange:NSMakeRange(0, [self length]) encode:NO makeString:YES];
    return name == nil ? self : name;
}

// Treats the whole receiver as a host name and returns its encoded form,
// or the receiver itself when no mapping is made.
- (NSString *)_web_encodeHostName
{
    NSString *name = [self _web_mapHostNameWithRange:NSMakeRange(0, [self length]) encode:YES makeString:YES];
    return name == nil ? self : name;
}

// Returns the range of the URL scheme: everything before the first ':', provided
// that it is non-empty and consists only of ASCII letters, digits, '+', '.' and '-'.
// Returns {NSNotFound, 0} otherwise.
-(NSRange)_webkit_rangeOfURLScheme
{
    NSRange colon = [self rangeOfString:@":"];
    if (colon.location != NSNotFound && colon.location > 0) {
        NSRange scheme = {0, colon.location};
        static NSCharacterSet *InverseSchemeCharacterSet = nil;
        if (!InverseSchemeCharacterSet) {
            /*
             This stuff is very expensive.  10-15 msec on a 2x1.2GHz.  If not cached it swamps
             everything else when adding items to the autocomplete DB.  Makes me wonder if we
             even need to enforce the character set here.
            */
            NSString *acceptableCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
            InverseSchemeCharacterSet = [[[NSCharacterSet characterSetWithCharactersInString:acceptableCharacters] invertedSet] retain];
        }
        // Any character from the inverted set inside the candidate scheme disqualifies it.
        NSRange illegals = [self rangeOfCharacterFromSet:InverseSchemeCharacterSet options:0 range:scheme];
        if (illegals.location == NSNotFound)
            return scheme;
    }
    return NSMakeRange(NSNotFound, 0);
}

// YES when the receiver, after trimming whitespace, begins with a URL scheme.
-(BOOL)_webkit_looksLikeAbsoluteURL
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    return [[self _webkit_stringByTrimmingWhitespace] _webkit_rangeOfURLScheme].location != NSNotFound;
}

// Returns the text after the first '#' in the receiver, or nil when there is no '#'.
- (NSString *)_webkit_URLFragment
{
    NSRange fragmentRange;

    fragmentRange = [self rangeOfString:@"#" options:NSLiteralSearch];
    if (fragmentRange.location == NSNotFound)
        return nil;
    return [self substringFromIndex:fragmentRange.location + 1];
}

@end