/*
 * Copyright (C) 2005, 2007, 2008 Apple Inc. All rights reserved.
 * Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
 *     its contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#import "WebNSURLExtras.h"

#import "WebKitNSStringExtras.h"
#import "WebLocalizableStrings.h"
#import "WebNSDataExtras.h"
#import "WebNSObjectExtras.h"
#import "WebSystemInterface.h"
#import <Foundation/NSURLRequest.h>
#import <WebCore/KURL.h>
#import <WebCore/LoaderNSURLExtras.h>
#import <WebKitSystemInterface.h>
#import <wtf/Assertions.h>
#import <unicode/uchar.h>
#import <unicode/uidna.h>
#import <unicode/uscript.h>

using namespace WebCore;
using namespace WTF;

// Callback invoked once per host-name range found in a URL string.
typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);

// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
#define HOST_NAME_BUFFER_LENGTH 2048

// Size of the first-try buffer used when fetching a URL's bytes; longer URLs fall back to the heap.
#define URL_BYTES_BUFFER_LENGTH 2048

// One bit per ICU script code; a set bit means the script is allowed in displayed IDN host names.
// Filled in once (guarded by the pthread_once below) from IDNScriptWhiteList.txt.
static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];

// Returns YES if the code point is considered unsafe to show in a host name or URL
// because it is invisible or could be mistaken for URL syntax.
static inline BOOL isLookalikeCharacter(int charCode)
{
// FIXME: Move this code down into WebCore so it can be shared with other platforms.

// This function treats the following as unsafe, lookalike characters:
// any non-printable character, any character considered as whitespace that isn't already converted to a space by ICU,
// and any ignorable character.

// We also considered the characters in Mozilla's blacklist (http://kb.mozillazine.org/Network.IDN.blacklist_chars),
// and included all of these characters that ICU can encode.

    if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
        return YES;

    switch (charCode) {
        case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
        case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
        case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
        case 0x05B4: /* HEBREW POINT HIRIQ */
        case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
        case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
        case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
        case 0x0660: /* ARABIC INDIC DIGIT ZERO */
        case 0x06D4: /* ARABIC FULL STOP */
        case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */
        case 0x2027: /* HYPHENATION POINT */
        case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
        case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
        case 0x2044: /* FRACTION SLASH */
        case 0x2215: /* DIVISION SLASH */
        case 0x23ae: /* INTEGRAL EXTENSION */
        case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
        case 0x29F8: /* BIG SOLIDUS */
        case 0x29f6: /* SOLIDUS WITH OVERBAR */
        case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
        case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
        case 0x3008: /* LEFT ANGLE BRACKET */
        case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
        case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
        case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
        case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
        case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
        case 0x33DF: /* SQUARE A OVER M */
        case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
        case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
        case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
        case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
        case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
            return YES;
        default:
            return NO;
    }
}

// Maps a value in the range 0..15 to its uppercase hexadecimal digit.
// Out-of-range values are logged and mapped to '0'.
static char hexDigit(int i)
{
    // Valid nibble values are 0 through 15; 16 would otherwise produce 'G'.
    if (i < 0 || i > 15) {
        LOG_ERROR("illegal hex digit");
        return '0';
    }
    int h = i;
    if (h >= 10) {
        h = h - 10 + 'A';
    }
    else {
        h += '0';
    }
    return h;
}

// Returns YES if c is an ASCII hexadecimal digit of either case.
static BOOL isHexDigit(char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
}

// Returns the numeric value (0..15) of an ASCII hexadecimal digit.
// Any other character is logged and treated as 0.
static int hexDigitValue(char c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }
    LOG_ERROR("illegal hex digit");
    return 0;
}

// Calls f once for each host-name range found in a mailto: URL string.
static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
    // Skip quoted strings so that characters in them don't confuse us.
    // When we find a '?' character, we are past the part of the URL that contains host names.

    // The character sets are created lazily and retained for the life of the process.
    static NSCharacterSet *hostNameOrStringStartCharacters;
    if (hostNameOrStringStartCharacters == nil) {
        hostNameOrStringStartCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"@?"];
        CFRetain(hostNameOrStringStartCharacters);
    }
    static NSCharacterSet *hostNameEndCharacters;
    if (hostNameEndCharacters == nil) {
        hostNameEndCharacters = [NSCharacterSet characterSetWithCharactersInString:@">,?"];
        CFRetain(hostNameEndCharacters);
    }
    static NSCharacterSet *quotedStringCharacters;
    if (quotedStringCharacters == nil) {
        quotedStringCharacters = [NSCharacterSet characterSetWithCharactersInString:@"\"\\"];
        CFRetain(quotedStringCharacters);
    }

    unsigned stringLength = [string length];
    NSRange remaining = NSMakeRange(0, stringLength);

    while (1) {
        // Find start of host name or of quoted string.
        NSRange hostNameOrStringStart = [string rangeOfCharacterFromSet:hostNameOrStringStartCharacters options:0 range:remaining];
        if (hostNameOrStringStart.location == NSNotFound) {
            return;
        }
        unichar c = [string characterAtIndex:hostNameOrStringStart.location];
        remaining.location = NSMaxRange(hostNameOrStringStart);
        remaining.length = stringLength - remaining.location;

        if (c == '?') {
            return;
        }

        if (c == '@') {
            // Find end of host name.
            unsigned hostNameStart = remaining.location;
            NSRange hostNameEnd = [string rangeOfCharacterFromSet:hostNameEndCharacters options:0 range:remaining];
            BOOL done;
            if (hostNameEnd.location == NSNotFound) {
                hostNameEnd.location = stringLength;
                done = YES;
            } else {
                remaining.location = hostNameEnd.location;
                remaining.length = stringLength - remaining.location;
                done = NO;
            }

            // Process host name range.
            f(string, NSMakeRange(hostNameStart, hostNameEnd.location - hostNameStart), context);

            if (done) {
                return;
            }
        } else {
            // Skip quoted string.
            ASSERT(c == '"');
            while (1) {
                NSRange escapedCharacterOrStringEnd = [string rangeOfCharacterFromSet:quotedStringCharacters options:0 range:remaining];
                if (escapedCharacterOrStringEnd.location == NSNotFound) {
                    return;
                }
                c = [string characterAtIndex:escapedCharacterOrStringEnd.location];
                remaining.location = NSMaxRange(escapedCharacterOrStringEnd);
                remaining.length = stringLength - remaining.location;

                // If we are at the end of the quoted string, then break from the string loop back to the host name loop.
                if (c == '"') {
                    break;
                }

                // Skip escaped character.
                ASSERT(c == '\\');
                if (remaining.length == 0) {
                    return;
                }
                remaining.location += 1;
                remaining.length -= 1;
            }
        }
    }
}

// Calls f with the host-name range(s) of a URL string: every host name of a mailto: URL,
// or the single host name of a hierarchical ("scheme://...") URL. Does nothing if no host is found.
static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // Find hostnames. Too bad we can't use any real URL-parsing code to do this,
    // but we have to do it before doing all the %-escaping, and this is the only
    // code we have that parses mailto URLs anyway.

    // Maybe we should implement this using a character buffer instead?

    if ([string _webkit_hasCaseInsensitivePrefix:@"mailto:"]) {
        applyHostNameFunctionToMailToURLString(string, f, context);
        return;
    }

    // Find the host name in a hierarchical URL.
    // It comes after a "://" sequence, with scheme characters preceding.
    // It ends with the end of the string or a ":", "/", "?" or "#".
    // If there is a "@" character, the host part is just the part after the "@".
    NSRange separatorRange = [string rangeOfString:@"://"];
    if (separatorRange.location == NSNotFound) {
        return;
    }

    // Check that all characters before the :// are valid scheme characters.
    static NSCharacterSet *nonSchemeCharacters;
    if (nonSchemeCharacters == nil) {
        nonSchemeCharacters = [[NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."] invertedSet];
        CFRetain(nonSchemeCharacters);
    }
    if ([string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:NSMakeRange(0, separatorRange.location)].location != NSNotFound) {
        return;
    }

    unsigned stringLength = [string length];

    static NSCharacterSet *hostTerminators;
    if (hostTerminators == nil) {
        hostTerminators = [NSCharacterSet characterSetWithCharactersInString:@":/?#"];
        CFRetain(hostTerminators);
    }

    // Start after the separator.
    unsigned authorityStart = NSMaxRange(separatorRange);

    // Find terminating character.
    NSRange hostNameTerminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:NSMakeRange(authorityStart, stringLength - authorityStart)];
    unsigned hostNameEnd = hostNameTerminator.location == NSNotFound ? stringLength : hostNameTerminator.location;

    // Find "@" for the start of the host name.
    NSRange userInfoTerminator = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostNameEnd - authorityStart)];
    unsigned hostNameStart = userInfoTerminator.location == NSNotFound ? authorityStart : NSMaxRange(userInfoTerminator);

    f(string, NSMakeRange(hostNameStart, hostNameEnd - hostNameStart), context);
}

@implementation NSURL (WebNSURLExtras)

// Shared body of the two collector callbacks below. context points at an NSMutableArray *
// that is created on demand; ranges needing encoding (or decoding) are appended as NSValues.
// The caller owns the array once it has been created.
static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
{
    BOOL needsMapping = encode
        ? [string _web_hostNameNeedsEncodingWithRange:range]
        : [string _web_hostNameNeedsDecodingWithRange:range];
    if (!needsMapping) {
        return;
    }

    NSMutableArray **array = (NSMutableArray **)context;
    if (*array == nil) {
        *array = [[NSMutableArray alloc] init];
    }

    [*array addObject:[NSValue valueWithRange:range]];
}

// StringRangeApplierFunction that records host-name ranges needing IDN encoding.
static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
{
    return collectRangesThatNeedMapping(string, range, context, YES);
}

// StringRangeApplierFunction that records host-name ranges needing IDN decoding.
static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
{
    return collectRangesThatNeedMapping(string, range, context, NO);
}

// Returns string with each host name that needs it IDN-encoded (encode == YES) or
// IDN-decoded (encode == NO). Returns the original string object when nothing changes.
static NSString *mapHostNames(NSString *string, BOOL encode)
{
    // Generally, we want to optimize for the case where there is one host name that does not need mapping.

    // A pure-ASCII string has nothing to encode.
    if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
        return string;

    // Make a list of ranges that actually need mapping.
    NSMutableArray *hostNameRanges = nil;
    StringRangeApplierFunction f = encode
        ? collectRangesThatNeedEncoding
        : collectRangesThatNeedDecoding;
    applyHostNameFunctionToURLString(string, f, &hostNameRanges);
    if (hostNameRanges == nil)
        return string;

    // Do the mapping.
    // Walk the ranges back to front so that earlier ranges stay valid as replacements change the length.
    NSMutableString *mutableCopy = [string mutableCopy];
    unsigned i = [hostNameRanges count];
    while (i-- != 0) {
        NSRange hostNameRange = [[hostNameRanges objectAtIndex:i] rangeValue];
        NSString *mappedHostName = encode
            ? [string _web_encodeHostNameWithRange:hostNameRange]
            : [string _web_decodeHostNameWithRange:hostNameRange];
        [mutableCopy replaceCharactersInRange:hostNameRange withString:mappedHostName];
    }
    [hostNameRanges release];
    return [mutableCopy autorelease];
}

// Builds a URL from text typed by the user: trims whitespace, IDN-encodes host names,
// converts to UTF-8 and %-escapes every byte that is <= 0x20 or >= 0x7f.
// Returns nil for a nil string and an empty URL for an empty one.
+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string relativeToURL:(NSURL *)URL
{
    if (string == nil) {
        return nil;
    }
    string = mapHostNames([string _webkit_stringByTrimmingWhitespace], YES);

    NSData *userTypedData = [string dataUsingEncoding:NSUTF8StringEncoding];
    ASSERT(userTypedData);

    const UInt8 *inBytes = static_cast<const UInt8 *>([userTypedData bytes]);
    int inLength = [userTypedData length];
    if (inLength == 0) {
        return [NSURL URLWithString:@""];
    }

    char *outBytes = static_cast<char *>(malloc(inLength * 3)); // large enough to %-escape every character
    char *p = outBytes;
    int outLength = 0;
    int i;
    for (i = 0; i < inLength; i++) {
        UInt8 c = inBytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            *p++ = '%';
            *p++ = hexDigit(c >> 4);
            *p++ = hexDigit(c & 0xf);
            outLength += 3;
        }
        else {
            *p++ = c;
            outLength++;
        }
    }

    NSData *data = [NSData dataWithBytesNoCopy:outBytes length:outLength]; // adopts outBytes
    return [self _web_URLWithData:data relativeToURL:URL];
}

// Convenience form of _web_URLWithUserTypedString:relativeToURL: with no base URL.
+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string
{
    return [self _web_URLWithUserTypedString:string relativeToURL:nil];
}

// Convenience form of _web_URLWithDataAsString:relativeToURL: with no base URL.
+ (NSURL *)_web_URLWithDataAsString:(NSString *)string
{
    if (string == nil) {
        return nil;
    }
    return [self _web_URLWithDataAsString:string relativeToURL:nil];
}

// Builds a URL from a string holding raw URL bytes: the string is whitespace-trimmed and
// converted to bytes as ISO Latin 1. Returns nil for a nil string.
+ (NSURL *)_web_URLWithDataAsString:(NSString *)string relativeToURL:(NSURL *)baseURL
{
    if (string == nil) {
        return nil;
    }
    string = [string _webkit_stringByTrimmingWhitespace];
    NSData *data = [string dataUsingEncoding:NSISOLatin1StringEncoding];
    return [self _web_URLWithData:data relativeToURL:baseURL];
}

// Convenience form of _web_URLWithData:relativeToURL: with no base URL.
+ (NSURL *)_web_URLWithData:(NSData *)data
{
    return [NSURL _web_URLWithData:data relativeToURL:nil];
}

// Builds an absolute URL from raw URL bytes, resolved against baseURL.
// Returns nil for nil data and an empty URL for empty data.
+ (NSURL *)_web_URLWithData:(NSData *)data relativeToURL:(NSURL *)baseURL
{
    if (data == nil)
        return nil;

    NSURL *result = nil;
    size_t length = [data length];
    if (length > 0) {
        // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
        baseURL = [baseURL _webkit_URLByRemovingResourceSpecifier];

        const UInt8 *bytes = static_cast<const UInt8*>([data bytes]);
        // NOTE: We use UTF-8 here since this encoding is used when computing strings when returning URL components
        // (e.g calls to NSURL -path). However, this function is not tolerant of illegal UTF-8 sequences, which
        // could either be a malformed string or bytes in a different encoding, like shift-jis, so we fall back
        // onto using ISO Latin 1 in those cases.
        result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingUTF8, (CFURLRef)baseURL, YES));
        if (!result)
            result = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, bytes, length, kCFStringEncodingISOLatin1, (CFURLRef)baseURL, YES));
    } else
        result = [NSURL URLWithString:@""];

    return result;
}

// Returns the bytes of the receiver as reported by CFURLGetBytes. If the receiver has a
// base URL, it is first resolved against that base and the bytes of the resolved URL are returned.
- (NSData *)_web_originalData
{
    UInt8 *buffer = (UInt8 *)malloc(URL_BYTES_BUFFER_LENGTH);
    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        // The URL did not fit; ask for the required size and try again.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
        buffer = (UInt8 *)realloc(buffer, bytesToAllocate);
        bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate);
        ASSERT(bytesFilled == bytesToAllocate);
    }

    // buffer is adopted by the NSData
    NSData *data = [NSData dataWithBytesNoCopy:buffer length:bytesFilled freeWhenDone:YES];

    NSURL *baseURL = (NSURL *)CFURLGetBaseURL((CFURLRef)self);
    if (baseURL)
        return [[NSURL _web_URLWithData:data relativeToURL:baseURL] _web_originalData];
    return data;
}

// Returns _web_originalData as a string, with each byte interpreted as ISO Latin 1.
- (NSString *)_web_originalDataAsString
{
    return [[[NSString alloc] initWithData:[self _web_originalData] encoding:NSISOLatin1StringEncoding] autorelease];
}

// Returns a new string (owned by the caller) in which every code point that
// isLookalikeCharacter() rejects is replaced by the %-escaped bytes of its UTF-8 encoding.
static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
{
    CFIndex length = CFStringGetLength(string);
    Vector<UChar, 2048> sourceBuffer(length);
    CFStringGetCharacters(string, CFRangeMake(0, length), sourceBuffer.data());

    Vector<UChar, 2048> outBuffer;

    CFIndex i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(sourceBuffer, i, length, c)

        if (isLookalikeCharacter(c)) {
            uint8_t utf8Buffer[4];
            CFIndex offset = 0;
            UBool failure = false;
            U8_APPEND(utf8Buffer, offset, 4, c, failure)
            ASSERT(!failure);

            for (CFIndex j = 0; j < offset; ++j) {
                outBuffer.append('%');
                outBuffer.append(hexDigit(utf8Buffer[j] >> 4));
                outBuffer.append(hexDigit(utf8Buffer[j] & 0xf));
            }
        } else {
            UChar utf16Buffer[2];
            CFIndex offset = 0;
            UBool failure = false;
            U16_APPEND(utf16Buffer, offset, 2, c, failure)
            ASSERT(!failure);
            for (CFIndex j = 0; j < offset; ++j)
                outBuffer.append(utf16Buffer[j]);
        }
    }

    return CFStringCreateWithCharacters(NULL, outBuffer.data(), outBuffer.size());
}

// Returns a display form of the URL: %-escapes of bytes above 0x7f are unescaped when the
// result is valid UTF-8, host names are IDN-mapped, and lookalike characters are re-escaped.
- (NSString *)_web_userVisibleString
{
    NSData *data = [self _web_originalData];
    const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
    int length = [data length];

    bool needsHostNameDecoding = false;

    const unsigned char *p = before;
    int bufferLength = (length * 3) + 1;
    char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character
    char *q = after;
    int i;
    for (i = 0; i < length; i++) {
        unsigned char c = p[i];
        // unescape escape sequences that indicate bytes greater than 0x7f
        if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
            unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
            if (u > 0x7f) {
                // unescape
                *q++ = u;
            } else {
                // do not unescape
                *q++ = p[i];
                *q++ = p[i + 1];
                *q++ = p[i + 2];
            }
            i += 2;
        } else {
            *q++ = c;

            // Check for "xn--" in an efficient, non-case-sensitive, way.
            // The bound is on the number of bytes written, not on i: unescaping shrinks the output,
            // so i >= 3 does not guarantee that q[-4] lies inside the buffer.
            if (c == '-' && q - after >= 4 && !needsHostNameDecoding && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                needsHostNameDecoding = true;
        }
    }
    *q = '\0';

    // Check string to see if it can be converted to display using UTF-8
    NSString *result = [NSString stringWithUTF8String:after];
    if (!result) {
        // Could not convert to UTF-8.
        // Convert characters greater than 0x7f to escape sequences.
        // Shift current string to the end of the buffer
        // then we will copy back bytes to the start of the buffer
        // as we convert.
        int afterlength = q - after;
        char *p = after + bufferLength - afterlength - 1;
        memmove(p, after, afterlength + 1); // copies trailing '\0'
        char *q = after;
        while (*p) {
            unsigned char c = *p;
            if (c > 0x7f) {
                *q++ = '%';
                *q++ = hexDigit(c >> 4);
                *q++ = hexDigit(c & 0xf);
            } else {
                *q++ = *p;
            }
            p++;
        }
        *q = '\0';
        result = [NSString stringWithUTF8String:after];
    }

    free(after);

    // Host names are decoded only when an "xn--" sequence was seen; otherwise mapHostNames is asked to encode.
    result = mapHostNames(result, !needsHostNameDecoding);
    return WebCFAutorelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
}

// Returns YES if the URL has no bytes. A URL with a base URL is resolved first.
- (BOOL)_web_isEmpty
{
    if (!CFURLGetBaseURL((CFURLRef)self))
        return CFURLGetBytes((CFURLRef)self, NULL, 0) == 0;
    return [[self _web_originalData] length] == 0;
}

// Returns the URL's original bytes as a NUL-terminated C string.
// The pointer refers to the storage of an autoreleased NSMutableData.
- (const char *)_web_URLCString
{
    NSMutableData *data = [NSMutableData data];
    [data appendData:[self _web_originalData]];
    [data appendBytes:"\0" length:1];
    return (const char *)[data bytes];
}

// Returns the URL of the canonical request produced by the NSURLProtocol class that handles
// this URL, or self if no protocol class is found for it.
- (NSURL *)_webkit_canonicalize
{
    NSURLRequest *request = [[NSURLRequest alloc] initWithURL:self];
    Class concreteClass = WKNSURLProtocolClassForRequest(request);
    if (!concreteClass) {
        [request release];
        return self;
    }

    // This applies NSURL's concept of canonicalization, but not KURL's concept. It would
    // make sense to apply both, but when we tried that it caused a performance degradation
    // (see 5315926). It might make sense to apply only the KURL concept and not the NSURL
    // concept, but it's too risky to make that change for WebKit 3.0.
    NSURLRequest *newRequest = [concreteClass canonicalRequestForRequest:request];
    NSURL *newURL = [newRequest URL];
    NSURL *result = [[newURL retain] autorelease];
    [request release];

    return result;
}

typedef struct {
    NSString *scheme;
    NSString *user;
    NSString *password;
    NSString *host;
    CFIndex port; // kCFNotFound means ignore/omit
    NSString *path;
    NSString *query;
    NSString *fragment;
} WebKitURLComponents;

// Returns a URL made from the receiver's bytes that precede the given component, dropping the
// single byte just before the component as well. Returns self if the component is absent or
// the truncated bytes cannot be made into a URL.
- (NSURL *)_webkit_URLByRemovingComponent:(CFURLComponentType)component
{
    CFRange fragRg = CFURLGetByteRangeForComponent((CFURLRef)self, component, NULL);
    // Check to see if the component exists before decomposing the URL.
    if (fragRg.location == kCFNotFound)
        return self;

    UInt8 *urlBytes, buffer[2048];
    CFIndex numBytes = CFURLGetBytes((CFURLRef)self, buffer, 2048);
    if (numBytes == -1) {
        // The URL did not fit in the stack buffer; use a heap buffer of the required size.
        numBytes = CFURLGetBytes((CFURLRef)self, NULL, 0);
        urlBytes = static_cast<UInt8*>(malloc(numBytes));
        CFURLGetBytes((CFURLRef)self, urlBytes, numBytes);
    } else
        urlBytes = buffer;

    // Try UTF-8 first and fall back to ISO Latin 1.
    NSURL *result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingUTF8, NULL));
    if (!result)
        result = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, urlBytes, fragRg.location - 1, kCFStringEncodingISOLatin1, NULL));

    if (urlBytes != buffer) free(urlBytes);
    return result ? [result autorelease] : self;
}

// Returns the URL without its fragment.
- (NSURL *)_webkit_URLByRemovingFragment
{
    return [self _webkit_URLByRemovingComponent:kCFURLComponentFragment];
}

// Returns the URL without its resource specifier.
- (NSURL *)_webkit_URLByRemovingResourceSpecifier
{
    return [self _webkit_URLByRemovingComponent:kCFURLComponentResourceSpecifier];
}

// Returns YES if the URL's original bytes start with "javascript:" (case-insensitive).
- (BOOL)_webkit_isJavaScriptURL
{
    return [[self _web_originalDataAsString] _webkit_isJavaScriptURL];
}

// Returns the unescaped script of a javascript: URL, or nil for any other URL.
- (NSString *)_webkit_scriptIfJavaScriptURL
{
    return [[self absoluteString] _webkit_scriptIfJavaScriptURL];
}

// Returns YES if the URL's original bytes start with "file:" (case-insensitive).
- (BOOL)_webkit_isFileURL
{
    return [[self _web_originalDataAsString] _webkit_isFileURL];
}

// Returns YES if the URL's original bytes start with "ftp:" and end with '/'.
- (BOOL)_webkit_isFTPDirectoryURL
{
    return [[self _web_originalDataAsString] _webkit_isFTPDirectoryURL];
}

// Returns YES for about: URLs and for empty URLs.
- (BOOL)_webkit_shouldLoadAsEmptyDocument
{
    return [[self _web_originalDataAsString] _webkit_hasCaseInsensitivePrefix:@"about:"] || [self _web_isEmpty];
}

// Returns a URL whose scheme bytes are ASCII-lowercased, or self if nothing needed to change.
- (NSURL *)_web_URLWithLowercasedScheme
{
    // Note that the range used below is the one written through the out parameter.
    CFRange range;
    CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &range);
    if (range.location == kCFNotFound) {
        return self;
    }

    UInt8 static_buffer[URL_BYTES_BUFFER_LENGTH];
    UInt8 *buffer = static_buffer;
    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        // The URL did not fit in the stack buffer; use a heap buffer of the required size.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
        buffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate);
        ASSERT(bytesFilled == bytesToAllocate);
    }

    int i;
    BOOL changed = NO;
    for (i = 0; i < range.length; ++i) {
        char c = buffer[range.location + i];
        char lower = toASCIILower(c);
        if (c != lower) {
            buffer[range.location + i] = lower;
            changed = YES;
        }
    }

    NSURL *result = changed
        ? (NSURL *)WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, buffer, bytesFilled, kCFStringEncodingUTF8, nil, YES))
        : (NSURL *)self;

    if (buffer != static_buffer) {
        free(buffer);
    }

    return result;
}


// Returns YES if the query range, including its separators, is exactly one byte long.
-(BOOL)_web_hasQuestionMarkOnlyQueryString
{
    CFRange rangeWithSeparators;
    CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentQuery, &rangeWithSeparators);
    if (rangeWithSeparators.location != kCFNotFound && rangeWithSeparators.length == 1) {
        return YES;
    }
    return NO;
}

// Returns the separator characters that follow the scheme's ':' (for example "//") as
// ISO Latin 1 data, or nil if they cannot be determined.
-(NSData *)_web_schemeSeparatorWithoutColon
{
    NSData *result = nil;
    CFRange rangeWithSeparators;
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &rangeWithSeparators);
    if (rangeWithSeparators.location != kCFNotFound) {
        NSString *absoluteString = [self absoluteString];
        // Skip the scheme itself and the ':' that follows it.
        NSRange separatorsRange = NSMakeRange(range.location + range.length + 1, rangeWithSeparators.length - range.length - 1);
        if (separatorsRange.location + separatorsRange.length <= [absoluteString length]) {
            NSString *slashes = [absoluteString substringWithRange:separatorsRange];
            result = [slashes dataUsingEncoding:NSISOLatin1StringEncoding];
        }
    }
    return result;
}

// Pseudo component type meaning "all bytes of the URL".
#define completeURL (CFURLComponentType)-1

// Returns the bytes of one URL component (or of the whole URL for completeURL), with every byte
// that is <= 0x20 or >= 0x7f %-escaped. A query component gets a leading '?' when it is
// non-empty or when the URL has a question-mark-only query. Returns nil if the component is absent.
-(NSData *)_web_dataForURLComponentType:(CFURLComponentType)componentType
{
    UInt8 staticAllBytesBuffer[URL_BYTES_BUFFER_LENGTH];
    UInt8 *allBytesBuffer = staticAllBytesBuffer;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        // The URL did not fit in the stack buffer; use a heap buffer of the required size.
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
        allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, bytesToAllocate);
    }

    CFRange range;
    if (componentType != completeURL) {
        range = CFURLGetByteRangeForComponent((CFURLRef)self, componentType, NULL);
        if (range.location == kCFNotFound) {
            // Release the heap buffer, if one was needed, before bailing out.
            if (staticAllBytesBuffer != allBytesBuffer) {
                free(allBytesBuffer);
            }
            return nil;
        }
    }
    else {
        range.location = 0;
        range.length = bytesFilled;
    }

    NSData *componentData = [NSData dataWithBytes:allBytesBuffer + range.location length:range.length];

    const unsigned char *bytes = static_cast<const unsigned char *>([componentData bytes]);
    NSMutableData *resultData = [NSMutableData data];
    // NOTE: add leading '?' to non-zero length query strings.
    // NOTE: retain question-mark only query strings.
    if (componentType == kCFURLComponentQuery) {
        if (range.length > 0 || [self _web_hasQuestionMarkOnlyQueryString]) {
            [resultData appendBytes:"?" length:1];
        }
    }
    int i;
    for (i = 0; i < range.length; i++) {
        unsigned char c = bytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            char escaped[3];
            escaped[0] = '%';
            escaped[1] = hexDigit(c >> 4);
            escaped[2] = hexDigit(c & 0xf);
            [resultData appendBytes:escaped length:3];
        }
        else {
            char b[1];
            b[0] = c;
            [resultData appendBytes:b length:1];
        }
    }

    if (staticAllBytesBuffer != allBytesBuffer) {
        free(allBytesBuffer);
    }

    return resultData;
}

// Returns the escaped bytes of the scheme component, or nil if there is none.
-(NSData *)_web_schemeData
{
    return [self _web_dataForURLComponentType:kCFURLComponentScheme];
}

// Returns the escaped bytes of the host component, or nil if there is none.
// For file URLs a host of "localhost" is reported as nil.
-(NSData *)_web_hostData
{
    NSData *result = [self _web_dataForURLComponentType:kCFURLComponentHost];
    NSData *scheme = [self _web_schemeData];
    // Take off localhost for file
    if ([scheme _web_isCaseInsensitiveEqualToCString:"file"]) {
        return ([result _web_isCaseInsensitiveEqualToCString:"localhost"]) ? nil : result;
    }
    return result;
}

// Returns _web_hostData decoded as UTF-8; empty data is used when there is no host.
- (NSString *)_web_hostString
{
    NSData *data = [self _web_hostData];
    if (!data) {
        data = [NSData data];
    }
    // Use the nil-checked data rather than fetching the host data a second time.
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] autorelease];
}

// Forwards to suggestedFilenameWithMIMEType() with this URL and the given MIME type.
- (NSString *)_webkit_suggestedFilenameWithMIMEType:(NSString *)MIMEType
{
    return suggestedFilenameWithMIMEType(self, MIMEType);
}

@end

@implementation NSString (WebNSURLExtras)

// Returns YES if the string contains none of the things that would make
// -[NSURL _web_userVisibleString] change it.
- (BOOL)_web_isUserVisibleURL
{
    BOOL valid = YES;
    // get buffer

    char static_buffer[1024];
    const char *p;
    BOOL success = CFStringGetCString((CFStringRef)self, static_buffer, 1023, kCFStringEncodingUTF8);
    if (success) {
        p = static_buffer;
    } else {
        p = [self UTF8String];
    }

    // No UTF-8 representation is available, so there is nothing to scan.
    if (!p) {
        return NO;
    }

    int length = strlen(p);

    // check for characters <= 0x20 or == 0x7f, %-escape sequences of bytes > 0x7f, and xn--, these
    // are the things that will lead _web_userVisibleString to actually change things.
    int i;
    for (i = 0; i < length; i++) {
        unsigned char c = p[i];
        // escape control characters, space, and delete
        if (c <= 0x20 || c == 0x7f) {
            valid = NO;
            break;
        } else if (c == '%' && (i + 1 < length && isHexDigit(p[i + 1])) && i + 2 < length && isHexDigit(p[i + 2])) {
            unsigned char u = (hexDigitValue(p[i + 1]) << 4) | hexDigitValue(p[i + 2]);
            if (u > 0x7f) {
                valid = NO;
                break;
            }
            i += 2;
        } else {
            // Check for "xn--" in an efficient, non-case-sensitive, way.
            if (c == '-' && i >= 3 && (p[i - 3] | 0x20) == 'x' && (p[i - 2] | 0x20) == 'n' && p[i - 1] == '-') {
                valid = NO;
                break;
            }
        }
    }

    return valid;
}


// Returns YES if the string starts with "javascript:" (case-insensitive).
- (BOOL)_webkit_isJavaScriptURL
{
    return [self _webkit_hasCaseInsensitivePrefix:@"javascript:"];
}

// Returns YES if the string starts with "file:" (case-insensitive).
- (BOOL)_webkit_isFileURL
{
    return [self rangeOfString:@"file:" options:(NSCaseInsensitiveSearch | NSAnchoredSearch)].location != NSNotFound;
}

// Returns the result of passing the string to decodeURLEscapeSequences().
- (NSString *)_webkit_stringByReplacingValidPercentEscapes
{
    return decodeURLEscapeSequences(self);
}

// Returns the script portion of a javascript: URL string with escapes replaced,
// or nil if the string is not a javascript: URL.
- (NSString *)_webkit_scriptIfJavaScriptURL
{
    if (![self _webkit_isJavaScriptURL]) {
        return nil;
    }
    // 11 is the length of "javascript:".
    return [[self substringFromIndex:11] _webkit_stringByReplacingValidPercentEscapes];
}

// Returns YES if the string starts with "ftp:" (case-insensitive) and ends with '/'.
- (BOOL)_webkit_isFTPDirectoryURL
{
    int length = [self length];
    if (length < 5) {  // 5 is length of "ftp:/"
        return NO;
    }
    unichar lastChar = [self characterAtIndex:length - 1];
    return lastChar == '/' && [self _webkit_hasCaseInsensitivePrefix:@"ftp:"];
}


// Reads script names from the given file and sets the matching bits in IDNScriptWhiteList.
// Returns NO if filename is nil or the file cannot be opened, YES otherwise.
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename) {
        return NO;
    }
    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (file == NULL) {
        return NO;
    }

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present.
        int result = fscanf(file, " #%*[^\n\r]%*[\n\r]");
        if (result == EOF) {
            break;
        }

        // Read a script name if present.
        char word[33];
        result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF) {
            break;
        }
        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // Unsigned literal: shifting a signed 1 by 31 would overflow.
                uint32_t mask = 1u << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}

// Loads the script white list from the first IDNScriptWhiteList.txt found in a Library
// directory, falling back to the copy inside the com.apple.WebKit bundle.
static void readIDNScriptWhiteList(void)
{
    // Read white list from library.
    NSArray *dirs = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    int i, numDirs = [dirs count];
    for (i = 0; i < numDirs; i++) {
        NSString *dir = [dirs objectAtIndex:i];
        if (readIDNScriptWhiteListFile([dir stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"])) {
            return;
        }
    }

    // Fall back on white list inside bundle.
    NSBundle *bundle = [NSBundle bundleWithIdentifier:@"com.apple.WebKit"];
    readIDNScriptWhiteListFile([bundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"]);
}

// Returns YES only if every code point in the UTF-16 buffer belongs to a white-listed script
// and is not a lookalike character. The white list is loaded on first use.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        UChar32 c;
        U16_NEXT(buffer, i, length, c)
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        if (script >= USCRIPT_CODE_LIMIT) {
            return NO;
        }
        size_t index = script / 32;
        // Unsigned literal: shifting a signed 1 by 31 would overflow.
        uint32_t mask = 1u << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask)) {
            return NO;
        }

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}

// Return value of nil means no mapping is necessary.
// When makeString is NO, a non-nil result is always self and only signals that mapping is needed.
// When makeString is YES, a non-nil result is the mapped host name.
//
// `range` selects the host name inside the receiver. `encode` chooses the direction:
// YES runs uidna_IDNToASCII, NO runs uidna_IDNToUnicode. nil is also returned when the
// range is longer than HOST_NAME_BUFFER_LENGTH, when the receiver is empty, when ICU
// reports anything other than U_ZERO_ERROR, or (decoding only) when the decoded name
// fails the IDN script white list check.
- (NSString *)_web_mapHostNameWithRange:(NSRange)range encode:(BOOL)encode makeString:(BOOL)makeString
{
    if (range.length > HOST_NAME_BUFFER_LENGTH || [self length] == 0)
        return nil;

    UChar input[HOST_NAME_BUFFER_LENGTH];
    UChar output[HOST_NAME_BUFFER_LENGTH];

    // When encoding, a host that contains '%' is percent-unescaped first; if the
    // unescaping fails the original string and range are used unchanged.
    NSString *hostSource = self;
    if (encode) {
        NSRange percentRange = [self rangeOfString:@"%" options:NSLiteralSearch range:range];
        if (percentRange.location != NSNotFound) {
            NSString *escapedHost = [self substringWithRange:range];
            NSString *unescapedHost = WebCFAutorelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)escapedHost, CFSTR("")));
            if (unescapedHost != nil) {
                hostSource = unescapedHost;
                range = NSMakeRange(0, [hostSource length]);
            }
        }
    }

    int inputLength = range.length;
    [hostSource getCharacters:input range:range];

    UErrorCode status = U_ZERO_ERROR;
    int32_t outputLength;
    if (encode)
        outputLength = uidna_IDNToASCII(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    else
        outputLength = uidna_IDNToUnicode(input, inputLength, output, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    if (status != U_ZERO_ERROR)
        return nil;

    // Identical input and output means there is nothing to map.
    BOOL unchanged = outputLength == inputLength && memcmp(input, output, inputLength * sizeof(UChar)) == 0;
    if (unchanged)
        return nil;

    // Decoded names are only accepted when every character passes the white list check.
    if (!encode && !allCharactersInIDNScriptWhiteList(output, outputLength))
        return nil;

    if (!makeString)
        return self;
    return [NSString stringWithCharacters:output length:outputLength];
}

// YES if decoding the host name in `range` would produce a different, acceptable name.
- (BOOL)_web_hostNameNeedsDecodingWithRange:(NSRange)range
{
    NSString *marker = [self _web_mapHostNameWithRange:range encode:NO makeString:NO];
    return marker != nil;
}

// YES if encoding the host name in `range` would produce a different name.
- (BOOL)_web_hostNameNeedsEncodingWithRange:(NSRange)range
{
    NSString *marker = [self _web_mapHostNameWithRange:range encode:YES makeString:NO];
    return marker != nil;
}

// Decoded form of the host name in `range`, or nil when no mapping applies.
- (NSString *)_web_decodeHostNameWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:NO makeString:YES];
}

// Encoded form of the host name in `range`, or nil when no mapping applies.
- (NSString *)_web_encodeHostNameWithRange:(NSRange)range
{
    return [self _web_mapHostNameWithRange:range encode:YES makeString:YES];
}

// Treats the whole receiver as a host name and decodes it; returns the receiver
// itself when no mapping applies.
- (NSString *)_web_decodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *decoded = [self _web_mapHostNameWithRange:wholeString encode:NO makeString:YES];
    if (decoded == nil)
        return self;
    return decoded;
}

// Treats the whole receiver as a host name and encodes it; returns the receiver
// itself when no mapping applies.
- (NSString *)_web_encodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *encoded = [self _web_mapHostNameWithRange:wholeString encode:YES makeString:YES];
    if (encoded == nil)
        return self;
    return encoded;
}

// Returns the range of the URL scheme (everything before the first ':') when that
// prefix is non-empty and consists only of letters, digits, '+', '.' and '-'.
// Otherwise returns {NSNotFound, 0}.
- (NSRange)_webkit_rangeOfURLScheme
{
    NSRange notFound = NSMakeRange(NSNotFound, 0);

    NSRange colonRange = [self rangeOfString:@":"];
    if (colonRange.location == NSNotFound || colonRange.location == 0)
        return notFound;

    // Building the inverted character set is very expensive (10-15 msec on a 2x1.2GHz
    // machine according to the original measurement). If it is not cached it swamps
    // everything else when adding items to the autocomplete DB, which raises the
    // question of whether the character set needs to be enforced here at all.
    static NSCharacterSet *nonSchemeCharacters = nil;
    if (nonSchemeCharacters == nil) {
        NSCharacterSet *schemeCharacters = [NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"];
        nonSchemeCharacters = [[schemeCharacters invertedSet] retain];
    }

    NSRange candidate = NSMakeRange(0, colonRange.location);
    NSRange offender = [self rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:candidate];
    if (offender.location != NSNotFound)
        return notFound;
    return candidate;
}

// YES if the receiver, after trimming whitespace, begins with a valid URL scheme.
- (BOOL)_webkit_looksLikeAbsoluteURL
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    NSString *trimmed = [self _webkit_stringByTrimmingWhitespace];
    NSRange schemeRange = [trimmed _webkit_rangeOfURLScheme];
    return schemeRange.location != NSNotFound;
}

// Returns everything after the first '#' in the receiver, or nil when there is no '#'.
- (NSString *)_webkit_URLFragment
{
    NSRange hashRange = [self rangeOfString:@"#" options:NSLiteralSearch];
    if (hashRange.location == NSNotFound)
        return nil;
    return [self substringFromIndex:hashRange.location + 1];
}

@end