• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright (C) 2005, 2007, 2008 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com)
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1.  Redistributions of source code must retain the above copyright
10 *     notice, this list of conditions and the following disclaimer.
11 * 2.  Redistributions in binary form must reproduce the above copyright
12 *     notice, this list of conditions and the following disclaimer in the
13 *     documentation and/or other materials provided with the distribution.
14 * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15 *     its contributors may be used to endorse or promote products derived
16 *     from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#import "WebNSURLExtras.h"
31
32#import "WebKitNSStringExtras.h"
33#import "WebLocalizableStrings.h"
34#import "WebNSDataExtras.h"
35#import "WebNSObjectExtras.h"
36#import "WebSystemInterface.h"
37#import <Foundation/NSURLRequest.h>
38#import <WebCore/KURL.h>
39#import <WebCore/LoaderNSURLExtras.h>
40#import <WebKitSystemInterface.h>
41#import <wtf/Assertions.h>
42#import <unicode/uchar.h>
43#import <unicode/uidna.h>
44#import <unicode/uscript.h>
45
46using namespace WebCore;
47using namespace WTF;
48
// Callback invoked once per host name range found in a URL string. `context` is an opaque
// pointer handed through unchanged from whoever started the scan.
typedef void (* StringRangeApplierFunction)(NSString *string, NSRange range, void *context);

// Needs to be big enough to hold an IDN-encoded name.
// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
#define HOST_NAME_BUFFER_LENGTH 2048

// Size in bytes of the first-try buffer handed to CFURLGetBytes; longer URLs fall back to a
// buffer sized by asking CFURLGetBytes for the required length.
#define URL_BYTES_BUFFER_LENGTH 2048

// One-time initialization guard. NOTE(review): presumably protects loading of the IDN script
// whitelist; the code that uses it is not in this part of the file -- confirm.
static pthread_once_t IDNScriptWhiteListFileRead = PTHREAD_ONCE_INIT;
// Sized as 32-bit words with enough bits for one per script code below USCRIPT_CODE_LIMIT.
static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + 31) / 32];
59
// Returns YES when a code point must not be shown verbatim in a user-visible URL because it
// could be mistaken for something else (or not be seen at all).
static inline BOOL isLookalikeCharacter(int charCode)
{
    // FIXME: Move this code down into WebCore so it can be shared with other platforms.

    // Unsafe by category: anything non-printable, anything ICU classifies as white space,
    // and anything that is default-ignorable.
    if (!u_isprint(charCode))
        return YES;
    if (u_isUWhiteSpace(charCode))
        return YES;
    if (u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
        return YES;

    // Unsafe by name: the characters from Mozilla's blacklist
    // (http://kb.mozillazine.org/Network.IDN.blacklist_chars) that ICU can encode.
    switch (charCode) {
        case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
        case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
        case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
        case 0x05B4: /* HEBREW POINT HIRIQ */
        case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
        case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
        case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
        case 0x0660: /* ARABIC INDIC DIGIT ZERO */
        case 0x06D4: /* ARABIC FULL STOP */
        case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */
        case 0x2027: /* HYPHENATION POINT */
        case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
        case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
        case 0x2044: /* FRACTION SLASH */
        case 0x2215: /* DIVISION SLASH */
        case 0x23ae: /* INTEGRAL EXTENSION */
        case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
        case 0x29F8: /* BIG SOLIDUS */
        case 0x29f6: /* SOLIDUS WITH OVERBAR */
        case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
        case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
        case 0x3008: /* LEFT ANGLE BRACKET */
        case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
        case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
        case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
        case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
        case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
        case 0x33DF: /* SQUARE A OVER M */
        case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
        case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
        case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
        case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
        case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
            return YES;
        default:
            break;
    }

    return NO;
}
113
// Returns the upper-case ASCII hexadecimal digit ('0'-'9', 'A'-'F') for a value in 0..15.
// Any other value is logged as an error and mapped to '0'.
static char hexDigit(int i)
{
    // 15 is the largest value one hex digit can represent; letting 16 through would yield 'G'.
    if (i < 0 || i > 15) {
        LOG_ERROR("illegal hex digit");
        return '0';
    }
    if (i >= 10)
        return static_cast<char>(i - 10 + 'A');
    return static_cast<char>(i + '0');
}
129
// Returns YES when c is an ASCII hexadecimal digit, in either letter case.
static BOOL isHexDigit(char c)
{
    if ('0' <= c && c <= '9')
        return YES;
    if ('A' <= c && c <= 'F')
        return YES;
    return 'a' <= c && c <= 'f';
}
134
// Returns the numeric value (0..15) of an ASCII hexadecimal digit in either letter case.
// Any other character is logged as an error and yields 0.
static int hexDigitValue(char c)
{
    if ('0' <= c && c <= '9')
        return c - '0';

    // Letters map onto 10..15 regardless of case.
    if ('A' <= c && c <= 'F')
        return 10 + (c - 'A');
    if ('a' <= c && c <= 'f')
        return 10 + (c - 'a');

    LOG_ERROR("illegal hex digit");
    return 0;
}
149
// Calls f once for every host name range found in a mailto: URL string.
//
// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?'
// character (or with the end of the string). Quoted strings are skipped so that characters
// inside them are not mistaken for delimiters. Scanning stops at the first unquoted '?',
// because nothing after it contains host names.
static void applyHostNameFunctionToMailToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    // The character sets are built on first use and retained for the rest of the process.
    static NSCharacterSet *hostOrQuoteStarters;
    if (!hostOrQuoteStarters) {
        hostOrQuoteStarters = [NSCharacterSet characterSetWithCharactersInString:@"\"@?"];
        CFRetain(hostOrQuoteStarters);
    }
    static NSCharacterSet *hostEnders;
    if (!hostEnders) {
        hostEnders = [NSCharacterSet characterSetWithCharactersInString:@">,?"];
        CFRetain(hostEnders);
    }
    static NSCharacterSet *quoteOrBackslash;
    if (!quoteOrBackslash) {
        quoteOrBackslash = [NSCharacterSet characterSetWithCharactersInString:@"\"\\"];
        CFRetain(quoteOrBackslash);
    }

    unsigned totalLength = [string length];
    NSRange unscanned = NSMakeRange(0, totalLength);

    for (;;) {
        // Locate the next '@' (host name follows), '"' (quoted string follows) or '?'.
        NSRange marker = [string rangeOfCharacterFromSet:hostOrQuoteStarters options:0 range:unscanned];
        if (marker.location == NSNotFound)
            return;

        unichar found = [string characterAtIndex:marker.location];
        unscanned.location = NSMaxRange(marker);
        unscanned.length = totalLength - unscanned.location;

        if (found == '?')
            return;

        if (found != '@') {
            // Skip over a quoted string, honoring backslash escapes inside it.
            ASSERT(found == '"');
            for (;;) {
                NSRange special = [string rangeOfCharacterFromSet:quoteOrBackslash options:0 range:unscanned];
                if (special.location == NSNotFound)
                    return;

                found = [string characterAtIndex:special.location];
                unscanned.location = NSMaxRange(special);
                unscanned.length = totalLength - unscanned.location;

                // A closing quote ends the quoted string; resume looking for host names.
                if (found == '"')
                    break;

                // Otherwise it was a backslash: step over the character it escapes.
                ASSERT(found == '\\');
                if (!unscanned.length)
                    return;
                unscanned.location += 1;
                unscanned.length -= 1;
            }
            continue;
        }

        // The host name runs from just after the '@' up to the next terminator or the end.
        unsigned hostStart = unscanned.location;
        NSRange terminator = [string rangeOfCharacterFromSet:hostEnders options:0 range:unscanned];
        BOOL reachedEnd = terminator.location == NSNotFound;
        NSUInteger hostEnd = reachedEnd ? totalLength : terminator.location;
        if (!reachedEnd) {
            // Resume at the terminator itself so that a '?' terminator ends the outer scan.
            unscanned.location = hostEnd;
            unscanned.length = totalLength - unscanned.location;
        }

        f(string, NSMakeRange(hostStart, hostEnd - hostStart), context);

        if (reachedEnd)
            return;
    }
}
237
// Calls f for each host name range found in a URL string.
//
// mailto: URLs are handed to the mailto-specific scanner. For any other string the host name
// is looked for after a "://" that is preceded only by scheme characters; it ends at the
// first ':', '/', '?' or '#' (or the end of the string), and if an '@' occurs before that
// point the host name starts just after it. Strings without such a "://" are ignored.
//
// Too bad we can't use any real URL-parsing code to do this, but it has to happen before the
// %-escaping is done, and this is the only code we have that parses mailto URLs anyway.
// Maybe this should be implemented using a character buffer instead?
//
// NOTE(review): because ':' is a terminator, "user:password@host" stops at the ':' inside
// the user info and never reaches the '@' -- confirm whether that is intended.
static void applyHostNameFunctionToURLString(NSString *string, StringRangeApplierFunction f, void *context)
{
    if ([string _webkit_hasCaseInsensitivePrefix:@"mailto:"]) {
        applyHostNameFunctionToMailToURLString(string, f, context);
        return;
    }

    NSRange schemeSeparator = [string rangeOfString:@"://"];
    if (schemeSeparator.location == NSNotFound)
        return;

    // Everything before the "://" must consist of valid scheme characters.
    static NSCharacterSet *nonSchemeCharacters;
    if (!nonSchemeCharacters) {
        NSCharacterSet *schemeCharacters = [NSCharacterSet characterSetWithCharactersInString:@"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-."];
        nonSchemeCharacters = [schemeCharacters invertedSet];
        CFRetain(nonSchemeCharacters);
    }
    NSRange candidateScheme = NSMakeRange(0, schemeSeparator.location);
    NSRange firstInvalid = [string rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:candidateScheme];
    if (firstInvalid.location != NSNotFound)
        return;

    unsigned length = [string length];

    static NSCharacterSet *hostTerminators;
    if (!hostTerminators) {
        hostTerminators = [NSCharacterSet characterSetWithCharactersInString:@":/?#"];
        CFRetain(hostTerminators);
    }

    // The authority begins right after the separator.
    unsigned authorityStart = NSMaxRange(schemeSeparator);

    // The host name ends at the first terminator, or at the end of the string.
    NSRange afterSeparator = NSMakeRange(authorityStart, length - authorityStart);
    NSRange terminator = [string rangeOfCharacterFromSet:hostTerminators options:0 range:afterSeparator];
    unsigned hostEnd = length;
    if (terminator.location != NSNotFound)
        hostEnd = terminator.location;

    // An "@" before the terminator marks the end of user info and the start of the host name.
    NSRange atSign = [string rangeOfString:@"@" options:0 range:NSMakeRange(authorityStart, hostEnd - authorityStart)];
    unsigned hostStart = authorityStart;
    if (atSign.location != NSNotFound)
        hostStart = NSMaxRange(atSign);

    f(string, NSMakeRange(hostStart, hostEnd - hostStart), context);
}
291
292@implementation NSURL (WebNSURLExtras)
293
// Host name callback shared by the encoding and decoding collectors. `context` must point to
// an NSMutableArray * variable; the array is created on demand (the caller owns it) and
// receives `range`, boxed in an NSValue, when the host name in that range needs mapping in
// the direction selected by `encode`.
static void collectRangesThatNeedMapping(NSString *string, NSRange range, void *context, BOOL encode)
{
    BOOL rangeNeedsWork;
    if (encode)
        rangeNeedsWork = [string _web_hostNameNeedsEncodingWithRange:range];
    else
        rangeNeedsWork = [string _web_hostNameNeedsDecodingWithRange:range];
    if (!rangeNeedsWork)
        return;

    NSMutableArray **rangesOut = static_cast<NSMutableArray **>(context);
    if (!*rangesOut)
        *rangesOut = [[NSMutableArray alloc] init];

    [*rangesOut addObject:[NSValue valueWithRange:range]];
}
310
// StringRangeApplierFunction adapter: records the range if the host name needs encoding.
static void collectRangesThatNeedEncoding(NSString *string, NSRange range, void *context)
{
    const BOOL encode = YES;
    collectRangesThatNeedMapping(string, range, context, encode);
}
315
// StringRangeApplierFunction adapter: records the range if the host name needs decoding.
static void collectRangesThatNeedDecoding(NSString *string, NSRange range, void *context)
{
    const BOOL encode = NO;
    collectRangesThatNeedMapping(string, range, context, encode);
}
320
// Returns `string` with every host name that needs it encoded (encode == YES) or decoded
// (encode == NO). The input string itself is returned when nothing has to change.
static NSString *mapHostNames(NSString *string, BOOL encode)
{
    // Optimize for the common case of a single host name that does not need mapping:
    // a pure-ASCII string never needs encoding.
    if (encode && [string canBeConvertedToEncoding:NSASCIIStringEncoding])
        return string;

    // Gather the host name ranges that actually need mapping; the array is only created
    // when at least one such range is found.
    NSMutableArray *rangesToMap = nil;
    StringRangeApplierFunction collector = collectRangesThatNeedDecoding;
    if (encode)
        collector = collectRangesThatNeedEncoding;
    applyHostNameFunctionToURLString(string, collector, &rangesToMap);
    if (!rangesToMap)
        return string;

    // Replace from the last range to the first so earlier ranges stay valid in the copy.
    // The mapped text is always computed from the untouched original string.
    NSMutableString *mapped = [string mutableCopy];
    for (unsigned remaining = [rangesToMap count]; remaining > 0; --remaining) {
        NSRange hostRange = [[rangesToMap objectAtIndex:remaining - 1] rangeValue];
        NSString *replacement;
        if (encode)
            replacement = [string _web_encodeHostNameWithRange:hostRange];
        else
            replacement = [string _web_decodeHostNameWithRange:hostRange];
        [mapped replaceCharactersInRange:hostRange withString:replacement];
    }
    [rangesToMap release];
    return [mapped autorelease];
}
350
// Builds a URL from text as a user would type it. The text is trimmed, its host names are
// encoded, and the UTF-8 bytes are %-escaped wherever a byte is <= 0x20 or >= 0x7f before the
// result is resolved against `URL`. Returns nil for a nil string and an empty URL for text
// that is empty after trimming.
+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string relativeToURL:(NSURL *)URL
{
    if (!string)
        return nil;

    NSString *trimmed = [string _webkit_stringByTrimmingWhitespace];
    NSData *utf8Data = [mapHostNames(trimmed, YES) dataUsingEncoding:NSUTF8StringEncoding];
    ASSERT(utf8Data);

    const UInt8 *source = static_cast<const UInt8 *>([utf8Data bytes]);
    int sourceLength = [utf8Data length];
    if (!sourceLength)
        return [NSURL URLWithString:@""];

    // Three output bytes per input byte is enough to %-escape every character.
    char *escaped = static_cast<char *>(malloc(sourceLength * 3));
    int escapedLength = 0;
    for (int index = 0; index < sourceLength; ++index) {
        UInt8 byte = source[index];
        if (byte > 0x20 && byte < 0x7f) {
            escaped[escapedLength++] = byte;
            continue;
        }
        escaped[escapedLength++] = '%';
        escaped[escapedLength++] = hexDigit(byte >> 4);
        escaped[escapedLength++] = hexDigit(byte & 0xf);
    }

    // The NSData adopts the malloc'ed buffer.
    NSData *data = [NSData dataWithBytesNoCopy:escaped length:escapedLength];
    return [self _web_URLWithData:data relativeToURL:URL];
}
388
// Convenience form of +_web_URLWithUserTypedString:relativeToURL: with no base URL.
+ (NSURL *)_web_URLWithUserTypedString:(NSString *)string
{
    NSURL *noBaseURL = nil;
    return [self _web_URLWithUserTypedString:string relativeToURL:noBaseURL];
}
393
// Convenience form of +_web_URLWithDataAsString:relativeToURL: with no base URL.
// Returns nil for a nil string.
+ (NSURL *)_web_URLWithDataAsString:(NSString *)string
{
    return string ? [self _web_URLWithDataAsString:string relativeToURL:nil] : nil;
}
401
// Builds a URL from a string whose characters stand for raw URL bytes: the string is trimmed,
// converted to bytes as ISO Latin-1, and resolved against baseURL.
// Returns nil for a nil string.
+ (NSURL *)_web_URLWithDataAsString:(NSString *)string relativeToURL:(NSURL *)baseURL
{
    if (!string)
        return nil;

    NSString *trimmed = [string _webkit_stringByTrimmingWhitespace];
    NSData *latin1Bytes = [trimmed dataUsingEncoding:NSISOLatin1StringEncoding];
    return [self _web_URLWithData:latin1Bytes relativeToURL:baseURL];
}
411
// Convenience form of +_web_URLWithData:relativeToURL: with no base URL.
+ (NSURL *)_web_URLWithData:(NSData *)data
{
    NSURL *noBaseURL = nil;
    return [NSURL _web_URLWithData:data relativeToURL:noBaseURL];
}
416
// Creates a URL from raw bytes, resolved against baseURL. Returns nil for nil data and an
// empty URL for zero-length data.
+ (NSURL *)_web_URLWithData:(NSData *)data relativeToURL:(NSURL *)baseURL
{
    if (!data)
        return nil;

    size_t byteCount = [data length];
    if (!byteCount)
        return [NSURL URLWithString:@""];

    // work around <rdar://4470771>: CFURLCreateAbsoluteURLWithBytes(.., TRUE) doesn't remove non-path components.
    baseURL = [baseURL _webkit_URLByRemovingResourceSpecifier];
    CFURLRef base = (CFURLRef)baseURL;

    const UInt8 *rawBytes = static_cast<const UInt8*>([data bytes]);

    // NOTE: UTF-8 is tried first since that encoding is used when computing strings for URL
    // components (e.g. calls to NSURL -path). That call is not tolerant of illegal UTF-8
    // sequences, which could be either a malformed string or bytes in a different encoding,
    // like shift-jis, so ISO Latin 1 is the fallback in those cases.
    NSURL *url = WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, rawBytes, byteCount, kCFStringEncodingUTF8, base, YES));
    if (url)
        return url;

    return WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, rawBytes, byteCount, kCFStringEncodingISOLatin1, base, YES));
}
441
// Returns the bytes of this URL as reported by CFURLGetBytes. When the URL has a base URL,
// the bytes are first resolved against it and the bytes of the resulting URL are returned.
- (NSData *)_web_originalData
{
    CFURLRef url = (CFURLRef)self;

    // Try a fixed-size buffer first; -1 means it was too small, so ask for the required
    // size and grow the buffer to fit.
    UInt8 *bytes = static_cast<UInt8 *>(malloc(URL_BYTES_BUFFER_LENGTH));
    CFIndex byteCount = CFURLGetBytes(url, bytes, URL_BYTES_BUFFER_LENGTH);
    if (byteCount == -1) {
        CFIndex required = CFURLGetBytes(url, NULL, 0);
        bytes = static_cast<UInt8 *>(realloc(bytes, required));
        byteCount = CFURLGetBytes(url, bytes, required);
        ASSERT(byteCount == required);
    }

    // The NSData adopts the buffer and frees it when done.
    NSData *ownBytes = [NSData dataWithBytesNoCopy:bytes length:byteCount freeWhenDone:YES];

    NSURL *base = (NSURL *)CFURLGetBaseURL(url);
    if (!base)
        return ownBytes;

    NSURL *resolved = [NSURL _web_URLWithData:ownBytes relativeToURL:base];
    return [resolved _web_originalData];
}
461
// Returns the bytes from -_web_originalData as a string, decoded as ISO Latin-1.
- (NSString *)_web_originalDataAsString
{
    NSData *urlBytes = [self _web_originalData];
    NSString *latin1String = [[NSString alloc] initWithData:urlBytes encoding:NSISOLatin1StringEncoding];
    return [latin1String autorelease];
}
466
// Returns a new string (owned by the caller) in which every code point that
// isLookalikeCharacter() flags is replaced by the %-escaped bytes of its UTF-8 encoding.
// All other code points are copied through unchanged.
static CFStringRef createStringWithEscapedUnsafeCharacters(CFStringRef string)
{
    const CFIndex sourceLength = CFStringGetLength(string);
    Vector<UChar, 2048> source(sourceLength);
    CFStringGetCharacters(string, CFRangeMake(0, sourceLength), source.data());

    Vector<UChar, 2048> escaped;

    for (CFIndex position = 0; position < sourceLength; ) {
        // Decode one code point (one or two UTF-16 units) and advance position past it.
        UChar32 codePoint;
        U16_NEXT(source, position, sourceLength, codePoint)

        if (!isLookalikeCharacter(codePoint)) {
            // Safe: re-encode as UTF-16 and copy through.
            UChar units[2];
            CFIndex unitCount = 0;
            UBool appendFailed = false;
            U16_APPEND(units, unitCount, 2, codePoint, appendFailed)
            ASSERT(!appendFailed);
            for (CFIndex unit = 0; unit < unitCount; ++unit)
                escaped.append(units[unit]);
            continue;
        }

        // Unsafe: emit "%XX" for each byte of the UTF-8 encoding.
        uint8_t utf8Bytes[4];
        CFIndex utf8Count = 0;
        UBool appendFailed = false;
        U8_APPEND(utf8Bytes, utf8Count, 4, codePoint, appendFailed)
        ASSERT(!appendFailed);

        for (CFIndex byte = 0; byte < utf8Count; ++byte) {
            escaped.append('%');
            escaped.append(hexDigit(utf8Bytes[byte] >> 4));
            escaped.append(hexDigit(utf8Bytes[byte] & 0xf));
        }
    }

    return CFStringCreateWithCharacters(NULL, escaped.data(), escaped.size());
}
505
// Returns a form of this URL meant for display.
//
// Steps:
//  1. %-escape sequences that stand for bytes above 0x7f are unescaped; all others are kept.
//  2. If the result is not valid UTF-8, every byte above 0x7f is %-escaped again so the
//     string is plain ASCII.
//  3. Host names are mapped with mapHostNames(): decoded if an "xn--" label was seen during
//     step 1, otherwise passed through the encoding direction.
//  4. Lookalike / invisible characters are %-escaped.
- (NSString *)_web_userVisibleString
{
    NSData *data = [self _web_originalData];
    const unsigned char *before = static_cast<const unsigned char*>([data bytes]);
    int length = [data length];

    bool needsHostNameDecoding = false;

    int bufferLength = (length * 3) + 1;
    char *after = static_cast<char *>(malloc(bufferLength)); // large enough to %-escape every character
    char *q = after;
    for (int i = 0; i < length; i++) {
        unsigned char c = before[i];
        // unescape escape sequences that indicate bytes greater than 0x7f
        if (c == '%' && i + 2 < length && isHexDigit(before[i + 1]) && isHexDigit(before[i + 2])) {
            unsigned char u = (hexDigitValue(before[i + 1]) << 4) | hexDigitValue(before[i + 2]);
            if (u > 0x7f) {
                // unescape
                *q++ = u;
            } else {
                // do not unescape
                *q++ = before[i];
                *q++ = before[i + 1];
                *q++ = before[i + 2];
            }
            i += 2;
        } else {
            *q++ = c;

            // Check for "xn--" in an efficient, non-case-sensitive, way.
            // The look-behind is into the OUTPUT buffer, which can be shorter than the input
            // consumed so far (an unescaped sequence turns three bytes into one), so the
            // guard has to be on the number of bytes written, not on the input index.
            if (c == '-' && !needsHostNameDecoding && q - after >= 4
                && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                needsHostNameDecoding = true;
        }
    }
    *q = '\0';

    // Check string to see if it can be converted to display using UTF-8
    NSString *result = [NSString stringWithUTF8String:after];
    if (!result) {
        // Could not convert to UTF-8.
        // Convert characters greater than 0x7f to escape sequences.
        // Shift current string to the end of the buffer
        // then we will copy back bytes to the start of the buffer
        // as we convert.
        int afterLength = q - after;
        char *source = after + bufferLength - afterLength - 1;
        memmove(source, after, afterLength + 1); // copies trailing '\0'
        char *destination = after;
        while (*source) {
            unsigned char c = *source;
            if (c > 0x7f) {
                *destination++ = '%';
                *destination++ = hexDigit(c >> 4);
                *destination++ = hexDigit(c & 0xf);
            } else {
                *destination++ = *source;
            }
            source++;
        }
        *destination = '\0';
        result = [NSString stringWithUTF8String:after];
    }

    free(after);

    result = mapHostNames(result, !needsHostNameDecoding);
    return WebCFAutorelease(createStringWithEscapedUnsafeCharacters((CFStringRef)result));
}
576
// Returns YES when this URL has no bytes. Without a base URL the length is asked of
// CFURLGetBytes directly; with a base URL the resolved bytes from -_web_originalData are
// measured instead.
- (BOOL)_web_isEmpty
{
    CFURLRef url = (CFURLRef)self;
    if (CFURLGetBaseURL(url))
        return [[self _web_originalData] length] == 0;
    return CFURLGetBytes(url, NULL, 0) == 0;
}
583
// Returns the bytes from -_web_originalData followed by a terminating NUL. The pointer
// refers to storage inside an NSMutableData obtained from +[NSMutableData data], so it is
// only valid for as long as that object lives.
- (const char *)_web_URLCString
{
    const char terminator = '\0';
    NSMutableData *terminated = [NSMutableData data];
    [terminated appendData:[self _web_originalData]];
    [terminated appendBytes:&terminator length:1];
    return static_cast<const char *>([terminated bytes]);
}
591
// Returns the URL of the canonical form of a request for this URL, as produced by the class
// that WKNSURLProtocolClassForRequest() returns for it. Returns self when no such class
// exists.
- (NSURL *)_webkit_canonicalize
{
    NSURLRequest *request = [[NSURLRequest alloc] initWithURL:self];
    NSURL *canonical = self;

    Class protocolClass = WKNSURLProtocolClassForRequest(request);
    if (protocolClass) {
        // This applies NSURL's concept of canonicalization, but not KURL's concept. It would
        // make sense to apply both, but when we tried that it caused a performance degradation
        // (see 5315926). It might make sense to apply only the KURL concept and not the NSURL
        // concept, but it's too risky to make that change for WebKit 3.0.
        NSURL *newURL = [[protocolClass canonicalRequestForRequest:request] URL];

        // Keep the URL alive past the release of the request below.
        canonical = [[newURL retain] autorelease];
    }

    [request release];
    return canonical;
}
612
// Plain aggregate with one field per URL component.
// NOTE(review): not referenced in the part of the file visible here -- confirm it is still used.
typedef struct {
    NSString *scheme;
    NSString *user;
    NSString *password;
    NSString *host;
    CFIndex port; // kCFNotFound means ignore/omit
    NSString *path;
    NSString *query;
    NSString *fragment;
} WebKitURLComponents;
623
// Returns a URL made from the bytes of this URL that precede the given component, also
// dropping the one byte immediately before it (its separator). Returns self when the
// component is absent or when a URL cannot be created from the truncated bytes.
- (NSURL *)_webkit_URLByRemovingComponent:(CFURLComponentType)component
{
    CFURLRef url = (CFURLRef)self;

    // Nothing to do when the component does not exist.
    CFRange componentRange = CFURLGetByteRangeForComponent(url, component, NULL);
    if (componentRange.location == kCFNotFound)
        return self;

    // Fetch the URL bytes, using the stack buffer when it is large enough.
    UInt8 stackBytes[2048];
    UInt8 *allBytes = stackBytes;
    CFIndex byteCount = CFURLGetBytes(url, stackBytes, 2048);
    if (byteCount == -1) {
        byteCount = CFURLGetBytes(url, NULL, 0);
        allBytes = static_cast<UInt8*>(malloc(byteCount));
        CFURLGetBytes(url, allBytes, byteCount);
    }

    // Keep everything before the separator; try UTF-8 first, then ISO Latin 1.
    const CFIndex keptLength = componentRange.location - 1;
    NSURL *truncated = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, allBytes, keptLength, kCFStringEncodingUTF8, NULL));
    if (!truncated)
        truncated = (NSURL *)CFMakeCollectable(CFURLCreateWithBytes(NULL, allBytes, keptLength, kCFStringEncodingISOLatin1, NULL));

    if (allBytes != stackBytes)
        free(allBytes);

    if (!truncated)
        return self;
    return [truncated autorelease];
}
647
// Returns this URL with its fragment component removed; see -_webkit_URLByRemovingComponent:.
- (NSURL *)_webkit_URLByRemovingFragment
{
    const CFURLComponentType fragment = kCFURLComponentFragment;
    return [self _webkit_URLByRemovingComponent:fragment];
}
652
// Returns this URL with its resource specifier component removed;
// see -_webkit_URLByRemovingComponent:.
- (NSURL *)_webkit_URLByRemovingResourceSpecifier
{
    const CFURLComponentType resourceSpecifier = kCFURLComponentResourceSpecifier;
    return [self _webkit_URLByRemovingComponent:resourceSpecifier];
}
657
// Applies the NSString -_webkit_isJavaScriptURL test to this URL's original bytes as a string.
- (BOOL)_webkit_isJavaScriptURL
{
    NSString *urlString = [self _web_originalDataAsString];
    return [urlString _webkit_isJavaScriptURL];
}
662
// Applies the NSString -_webkit_scriptIfJavaScriptURL extraction to this URL's
// -absoluteString (not to its original bytes, unlike the neighbouring predicates).
- (NSString *)_webkit_scriptIfJavaScriptURL
{
    NSString *urlString = [self absoluteString];
    return [urlString _webkit_scriptIfJavaScriptURL];
}
667
// Applies the NSString -_webkit_isFileURL test to this URL's original bytes as a string.
- (BOOL)_webkit_isFileURL
{
    NSString *urlString = [self _web_originalDataAsString];
    return [urlString _webkit_isFileURL];
}
672
// Applies the NSString -_webkit_isFTPDirectoryURL test to this URL's original bytes as a string.
- (BOOL)_webkit_isFTPDirectoryURL
{
    NSString *urlString = [self _web_originalDataAsString];
    return [urlString _webkit_isFTPDirectoryURL];
}
677
// Returns YES for URLs whose original bytes start with "about:" (compared case-insensitively)
// and for empty URLs.
- (BOOL)_webkit_shouldLoadAsEmptyDocument
{
    if ([[self _web_originalDataAsString] _webkit_hasCaseInsensitivePrefix:@"about:"])
        return YES;
    return [self _web_isEmpty];
}
682
// Returns a URL equal to this one except that ASCII upper-case letters in the scheme are
// lower-cased. Returns self when there is no scheme or when the scheme is already lower-case.
- (NSURL *)_web_URLWithLowercasedScheme
{
    // Use the RETURNED component range: that is the value documented to have its location set
    // to kCFNotFound when the URL has no such component. The previous code tested the
    // rangeIncludingSeparators out-parameter instead.
    CFRange range = CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, NULL);
    if (range.location == kCFNotFound)
        return self;

    // Fetch the URL bytes, using the stack buffer when it is large enough.
    UInt8 static_buffer[URL_BYTES_BUFFER_LENGTH];
    UInt8 *buffer = static_buffer;
    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, URL_BYTES_BUFFER_LENGTH);
    if (bytesFilled == -1) {
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
        buffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        bytesFilled = CFURLGetBytes((CFURLRef)self, buffer, bytesToAllocate);
        ASSERT(bytesFilled == bytesToAllocate);
    }

    // Lower-case the scheme in place, remembering whether anything actually changed.
    BOOL changed = NO;
    for (CFIndex i = 0; i < range.length; ++i) {
        char c = buffer[range.location + i];
        char lower = toASCIILower(c);
        if (c != lower) {
            buffer[range.location + i] = lower;
            changed = YES;
        }
    }

    // Only build a new URL when a byte was modified.
    NSURL *result = changed
        ? (NSURL *)WebCFAutorelease(CFURLCreateAbsoluteURLWithBytes(NULL, buffer, bytesFilled, kCFStringEncodingUTF8, nil, YES))
        : (NSURL *)self;

    if (buffer != static_buffer)
        free(buffer);

    return result;
}
722
723
// Returns YES when the query component's range including separators is found and is exactly
// one byte long.
// NOTE(review): presumably that single byte is a lone "?" with nothing after it -- confirm.
-(BOOL)_web_hasQuestionMarkOnlyQueryString
{
    CFRange queryWithSeparators;
    CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentQuery, &queryWithSeparators);
    return queryWithSeparators.location != kCFNotFound && queryWithSeparators.length == 1;
}
733
// Returns, as ISO Latin-1 bytes, the part of -absoluteString that lies one character past
// the end of the scheme and inside the scheme's range-including-separators. Returns nil when
// that range is not found or does not fit inside the absolute string.
// NOTE(review): presumably this is the run of slashes after the "scheme:" colon -- confirm.
-(NSData *)_web_schemeSeparatorWithoutColon
{
    CFRange schemeWithSeparators;
    CFRange scheme = CFURLGetByteRangeForComponent((CFURLRef)self, kCFURLComponentScheme, &schemeWithSeparators);
    if (schemeWithSeparators.location == kCFNotFound)
        return nil;

    NSString *absolute = [self absoluteString];

    // Start one past the scheme (skipping one character) and take whatever is left of the
    // separators after the scheme and that one character.
    NSRange afterColon = NSMakeRange(scheme.location + scheme.length + 1, schemeWithSeparators.length - scheme.length - 1);
    if (NSMaxRange(afterColon) > [absolute length])
        return nil;

    return [[absolute substringWithRange:afterColon] dataUsingEncoding:NSISOLatin1StringEncoding];
}
749
// Pseudo component type accepted by -_web_dataForURLComponentType: meaning "the whole URL".
#define completeURL (CFURLComponentType)-1

// Returns the bytes of one component of this URL (or of the whole URL for completeURL), with
// every byte <= 0x20 or >= 0x7f %-escaped. For the query component a leading '?' is added
// when the query is non-empty or the URL has a question-mark-only query string.
// Returns nil when the URL does not contain the requested component.
-(NSData *)_web_dataForURLComponentType:(CFURLComponentType)componentType
{
    static const int URLComponentTypeBufferLength = 2048;

    // Fetch the URL bytes, using the stack buffer when it is large enough.
    UInt8 staticAllBytesBuffer[URLComponentTypeBufferLength];
    UInt8 *allBytesBuffer = staticAllBytesBuffer;

    CFIndex bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, URLComponentTypeBufferLength);
    if (bytesFilled == -1) {
        CFIndex bytesToAllocate = CFURLGetBytes((CFURLRef)self, NULL, 0);
        allBytesBuffer = static_cast<UInt8 *>(malloc(bytesToAllocate));
        bytesFilled = CFURLGetBytes((CFURLRef)self, allBytesBuffer, bytesToAllocate);
    }

    CFRange range;
    if (componentType != completeURL) {
        range = CFURLGetByteRangeForComponent((CFURLRef)self, componentType, NULL);
        if (range.location == kCFNotFound) {
            // Do not leak the heap buffer used for URLs longer than the stack buffer.
            if (staticAllBytesBuffer != allBytesBuffer)
                free(allBytesBuffer);
            return nil;
        }
    } else {
        range.location = 0;
        range.length = bytesFilled;
    }

    const unsigned char *bytes = allBytesBuffer + range.location;
    NSMutableData *resultData = [NSMutableData data];
    // NOTE: add leading '?' to non-zero length query strings.
    // NOTE: retain question-mark only query strings.
    if (componentType == kCFURLComponentQuery) {
        if (range.length > 0 || [self _web_hasQuestionMarkOnlyQueryString])
            [resultData appendBytes:"?" length:1];
    }

    for (CFIndex i = 0; i < range.length; i++) {
        unsigned char c = bytes[i];
        if (c <= 0x20 || c >= 0x7f) {
            char escaped[3];
            escaped[0] = '%';
            escaped[1] = hexDigit(c >> 4);
            escaped[2] = hexDigit(c & 0xf);
            [resultData appendBytes:escaped length:3];
        } else {
            char b[1];
            b[0] = c;
            [resultData appendBytes:b length:1];
        }
    }

    if (staticAllBytesBuffer != allBytesBuffer)
        free(allBytesBuffer);

    return resultData;
}
812
// Returns the scheme component of the URL as data, obtained from
// -_web_dataForURLComponentType: with kCFURLComponentScheme.
-(NSData *)_web_schemeData
{
    NSData *schemeBytes = [self _web_dataForURLComponentType:kCFURLComponentScheme];
    return schemeBytes;
}
817
// Returns the host component of the URL as data. When the scheme compares
// case-insensitively equal to "file", a host of "localhost" (also compared
// case-insensitively) is treated as no host and nil is returned.
-(NSData *)_web_hostData
{
    NSData *hostBytes = [self _web_dataForURLComponentType:kCFURLComponentHost];
    NSData *schemeBytes = [self _web_schemeData];

    // Only file URLs get the special "localhost" treatment.
    if (![schemeBytes _web_isCaseInsensitiveEqualToCString:"file"]) {
        return hostBytes;
    }
    if ([hostBytes _web_isCaseInsensitiveEqualToCString:"localhost"]) {
        return nil;
    }
    return hostBytes;
}
828
// Returns the host of the URL as a string by decoding the bytes from
// -_web_hostData as UTF-8. When there is no host data, an empty NSData is
// substituted so that the string initializer is never handed nil.
- (NSString *)_web_hostString
{
    NSData *data = [self _web_hostData];
    if (!data) {
        data = [NSData data];
    }
    // Decode the nil-checked data computed above. Calling -_web_hostData a
    // second time here would bypass the fallback and repeat the work.
    return [[[NSString alloc] initWithData:data encoding:NSUTF8StringEncoding] autorelease];
}
837
// Forwards to suggestedFilenameWithMIMEType() with the receiver and the
// supplied MIME type, and returns whatever file name it produces.
- (NSString *)_webkit_suggestedFilenameWithMIMEType:(NSString *)MIMEType
{
    NSString *suggestedName = suggestedFilenameWithMIMEType(self, MIMEType);
    return suggestedName;
}
842
843@end
844
845@implementation NSString (WebNSURLExtras)
846
// Returns YES when the UTF-8 form of the string contains none of the
// following: a byte <= 0x20, a 0x7f byte, a valid %XX escape whose decoded
// value is above 0x7f, or an "xn--" sequence (with 'x' and 'n' matched in
// either case). Returns NO as soon as one of them is found.
- (BOOL)_web_isUserVisibleURL
{
    // Get a UTF-8 C string, preferring a stack buffer and falling back to
    // -UTF8String when the conversion into that buffer fails.
    char stackBuffer[1024];
    const char *utf8 = stackBuffer;
    if (!CFStringGetCString((CFStringRef)self, stackBuffer, 1023, kCFStringEncodingUTF8)) {
        utf8 = [self UTF8String];
    }

    int byteCount = strlen(utf8);

    for (int index = 0; index < byteCount; index++) {
        unsigned char current = utf8[index];

        // Control characters, space, and delete disqualify the string.
        if (current <= 0x20 || current == 0x7f) {
            return NO;
        }

        BOOL startsValidEscape = current == '%'
            && index + 1 < byteCount && isHexDigit(utf8[index + 1])
            && index + 2 < byteCount && isHexDigit(utf8[index + 2]);
        if (startsValidEscape) {
            unsigned char decoded = (hexDigitValue(utf8[index + 1]) << 4) | hexDigitValue(utf8[index + 2]);
            if (decoded > 0x7f) {
                return NO;
            }
            // Step over the two hex digits that were just examined.
            index += 2;
            continue;
        }

        // Cheap case-insensitive check for "xn--" ending at this byte.
        if (current == '-' && index >= 3
            && (utf8[index - 3] | 0x20) == 'x'
            && (utf8[index - 2] | 0x20) == 'n'
            && utf8[index - 1] == '-') {
            return NO;
        }
    }

    return YES;
}
890
891
// Returns YES when the string begins with "javascript:", compared
// case-insensitively via -_webkit_hasCaseInsensitivePrefix:.
- (BOOL)_webkit_isJavaScriptURL
{
    BOOL hasJavaScriptPrefix = [self _webkit_hasCaseInsensitivePrefix:@"javascript:"];
    return hasJavaScriptPrefix;
}
896
// Returns YES when the string starts with "file:" in any letter case.
- (BOOL)_webkit_isFileURL
{
    // An anchored search only matches at the very start of the string.
    NSRange prefixRange = [self rangeOfString:@"file:" options:(NSCaseInsensitiveSearch | NSAnchoredSearch)];
    return prefixRange.location != NSNotFound;
}
901
// Returns the result of passing the receiver through
// decodeURLEscapeSequences().
- (NSString *)_webkit_stringByReplacingValidPercentEscapes
{
    NSString *decoded = decodeURLEscapeSequences(self);
    return decoded;
}
906
// Returns the text following the "javascript:" prefix with valid percent
// escapes replaced, or nil when the receiver is not a JavaScript URL.
- (NSString *)_webkit_scriptIfJavaScriptURL
{
    if ([self _webkit_isJavaScriptURL]) {
        // 11 is the length of the "javascript:" prefix.
        NSString *scriptPortion = [self substringFromIndex:11];
        return [scriptPortion _webkit_stringByReplacingValidPercentEscapes];
    }
    return nil;
}
914
// Returns YES when the string is at least five characters long, ends with
// '/', and begins with "ftp:" (compared case-insensitively).
- (BOOL)_webkit_isFTPDirectoryURL
{
    int characterCount = [self length];
    // Anything shorter than "ftp:/" (5 characters) cannot qualify.
    if (characterCount >= 5) {
        BOOL endsWithSlash = [self characterAtIndex:characterCount - 1] == '/';
        return endsWithSlash && [self _webkit_hasCaseInsensitivePrefix:@"ftp:"];
    }
    return NO;
}
924
925
// Reads script names from the file at |filename| and sets the matching bits
// in IDNScriptWhiteList. Returns NO when |filename| is nil or the file cannot
// be opened, and YES once an opened file has been processed. Words that
// u_getPropertyValueEnum() does not map to a script code in range are ignored.
static BOOL readIDNScriptWhiteListFile(NSString *filename)
{
    if (!filename) {
        return NO;
    }
    FILE *file = fopen([filename fileSystemRepresentation], "r");
    if (file == NULL) {
        return NO;
    }

    // Read a word at a time.
    // Allow comments, starting with # character to the end of the line.
    while (1) {
        // Skip a comment if present.
        int result = fscanf(file, " #%*[^\n\r]%*[\n\r]");
        if (result == EOF) {
            break;
        }

        // Read a script name if present. At most 32 characters are stored;
        // the rest of an over-long word is consumed and discarded.
        char word[33];
        result = fscanf(file, " %32[^# \t\n\r]%*[^# \t\n\r] ", word);
        if (result == EOF) {
            break;
        }
        if (result == 1) {
            // Got a word, map to script code and put it into the array.
            int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
            if (script >= 0 && script < USCRIPT_CODE_LIMIT) {
                size_t index = script / 32;
                // Shift an unsigned 1 so that bit 31 can be set without
                // overflowing a signed int.
                uint32_t mask = 1U << (script % 32);
                IDNScriptWhiteList[index] |= mask;
            }
        }
    }
    fclose(file);
    return YES;
}
964
// Loads the IDN script white list. Each Library directory across all domains
// is tried for IDNScriptWhiteList.txt, stopping at the first file that is
// read successfully; if none is, the copy inside the WebKit bundle is read.
static void readIDNScriptWhiteList(void)
{
    NSArray *libraryDirectories = NSSearchPathForDirectoriesInDomains(NSLibraryDirectory, NSAllDomainsMask, YES);
    int directoryCount = [libraryDirectories count];
    for (int position = 0; position < directoryCount; position++) {
        NSString *directory = [libraryDirectories objectAtIndex:position];
        NSString *candidatePath = [directory stringByAppendingPathComponent:@"IDNScriptWhiteList.txt"];
        if (readIDNScriptWhiteListFile(candidatePath)) {
            return;
        }
    }

    // No Library directory supplied a readable file; use the bundled one.
    NSBundle *webKitBundle = [NSBundle bundleWithIdentifier:@"com.apple.WebKit"];
    NSString *bundledPath = [webKitBundle pathForResource:@"IDNScriptWhiteList" ofType:@"txt"];
    readIDNScriptWhiteListFile(bundledPath);
}
981
// Returns YES only when every code point in the UTF-16 |buffer| of |length|
// code units belongs to a script whose bit is set in IDNScriptWhiteList and
// is not flagged by isLookalikeCharacter(). The white list is loaded once,
// via pthread_once, before the first check. Any ICU error or out-of-range
// script code makes the function return NO.
static BOOL allCharactersInIDNScriptWhiteList(const UChar *buffer, int32_t length)
{
    pthread_once(&IDNScriptWhiteListFileRead, readIDNScriptWhiteList);

    int32_t i = 0;
    while (i < length) {
        UChar32 c;
        // U16_NEXT reads one code point and advances i past it.
        U16_NEXT(buffer, i, length, c)
        UErrorCode error = U_ZERO_ERROR;
        UScriptCode script = uscript_getScript(c, &error);
        if (error != U_ZERO_ERROR) {
            LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
            return NO;
        }
        if (script < 0) {
            LOG_ERROR("got negative number for script code from ICU: %d", script);
            return NO;
        }
        if (script >= USCRIPT_CODE_LIMIT) {
            return NO;
        }
        size_t index = script / 32;
        // Shift an unsigned 1 so that bit 31 can be tested without
        // overflowing a signed int.
        uint32_t mask = 1U << (script % 32);
        if (!(IDNScriptWhiteList[index] & mask)) {
            return NO;
        }

        if (isLookalikeCharacter(c))
            return NO;
    }
    return YES;
}
1014
// Runs the host name at |range| of the receiver through IDNA: to ASCII when
// |encode| is YES, to Unicode otherwise.
// A nil result means no mapping is necessary.
// With makeString NO, a non-nil result is self, indicating only that mapping is necessary.
// With makeString YES, a non-nil result is the mapped string.
- (NSString *)_web_mapHostNameWithRange:(NSRange)range encode:(BOOL)encode makeString:(BOOL)makeString
{
    // Ranges too long for the fixed-size buffers, and empty receivers, are left alone.
    if (range.length > HOST_NAME_BUFFER_LENGTH || [self length] == 0) {
        return nil;
    }

    NSString *hostSource = self;
    if (encode) {
        NSRange percentRange = [self rangeOfString:@"%" options:NSLiteralSearch range:range];
        if (percentRange.location != NSNotFound) {
            // Replace percent escapes before encoding; if that fails, keep
            // working on the original text and range.
            NSString *escapedHost = [self substringWithRange:range];
            NSString *unescapedHost = WebCFAutorelease(CFURLCreateStringByReplacingPercentEscapes(NULL, (CFStringRef)escapedHost, CFSTR("")));
            if (unescapedHost != nil) {
                hostSource = unescapedHost;
                range = NSMakeRange(0, [hostSource length]);
            }
        }
    }

    UChar sourceCharacters[HOST_NAME_BUFFER_LENGTH];
    UChar mappedCharacters[HOST_NAME_BUFFER_LENGTH];

    int sourceLength = range.length;
    [hostSource getCharacters:sourceCharacters range:range];

    UErrorCode status = U_ZERO_ERROR;
    int32_t mappedLength;
    if (encode) {
        mappedLength = uidna_IDNToASCII(sourceCharacters, sourceLength, mappedCharacters, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    } else {
        mappedLength = uidna_IDNToUnicode(sourceCharacters, sourceLength, mappedCharacters, HOST_NAME_BUFFER_LENGTH, UIDNA_ALLOW_UNASSIGNED, NULL, &status);
    }
    if (status != U_ZERO_ERROR) {
        return nil;
    }

    // Output identical to the input means there is nothing to map.
    BOOL unchanged = mappedLength == sourceLength
        && memcmp(sourceCharacters, mappedCharacters, sourceLength * sizeof(UChar)) == 0;
    if (unchanged) {
        return nil;
    }

    // A decoded name is rejected unless every character passes the script white list.
    if (!encode && !allCharactersInIDNScriptWhiteList(mappedCharacters, mappedLength)) {
        return nil;
    }

    if (!makeString) {
        return self;
    }
    return [NSString stringWithCharacters:mappedCharacters length:mappedLength];
}
1057
// Returns YES when mapping the host name at |range| in the decode direction
// reports that a mapping is necessary (a non-nil result).
- (BOOL)_web_hostNameNeedsDecodingWithRange:(NSRange)range
{
    NSString *mappingNeeded = [self _web_mapHostNameWithRange:range encode:NO makeString:NO];
    return mappingNeeded != nil;
}
1062
// Returns YES when mapping the host name at |range| in the encode direction
// reports that a mapping is necessary (a non-nil result).
- (BOOL)_web_hostNameNeedsEncodingWithRange:(NSRange)range
{
    NSString *mappingNeeded = [self _web_mapHostNameWithRange:range encode:YES makeString:NO];
    return mappingNeeded != nil;
}
1067
// Returns the decoded form of the host name at |range|, or nil when
// -_web_mapHostNameWithRange:encode:makeString: returns nil for it.
- (NSString *)_web_decodeHostNameWithRange:(NSRange)range
{
    NSString *decodedHost = [self _web_mapHostNameWithRange:range encode:NO makeString:YES];
    return decodedHost;
}
1072
// Returns the encoded form of the host name at |range|, or nil when
// -_web_mapHostNameWithRange:encode:makeString: returns nil for it.
- (NSString *)_web_encodeHostNameWithRange:(NSRange)range
{
    NSString *encodedHost = [self _web_mapHostNameWithRange:range encode:YES makeString:YES];
    return encodedHost;
}
1077
// Treats the whole receiver as a host name and returns its decoded form, or
// the receiver itself when the mapping returns nil.
- (NSString *)_web_decodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *decodedHost = [self _web_mapHostNameWithRange:wholeString encode:NO makeString:YES];
    if (decodedHost != nil) {
        return decodedHost;
    }
    return self;
}
1083
// Treats the whole receiver as a host name and returns its encoded form, or
// the receiver itself when the mapping returns nil.
- (NSString *)_web_encodeHostName
{
    NSRange wholeString = NSMakeRange(0, [self length]);
    NSString *encodedHost = [self _web_mapHostNameWithRange:wholeString encode:YES makeString:YES];
    if (encodedHost != nil) {
        return encodedHost;
    }
    return self;
}
1089
// Returns the range of the URL scheme: everything before the first ':' when
// that prefix is non-empty and made up only of ASCII letters, digits, '+',
// '.', and '-'. Otherwise returns {NSNotFound, 0}.
-(NSRange)_webkit_rangeOfURLScheme
{
    NSRange notFound = NSMakeRange(NSNotFound, 0);

    NSRange colonRange = [self rangeOfString:@":"];
    if (colonRange.location == NSNotFound || colonRange.location == 0) {
        return notFound;
    }

    static NSCharacterSet *nonSchemeCharacters = nil;
    if (!nonSchemeCharacters) {
        // Creating the inverted set is expensive (reportedly 10-15 msec on a
        // 2x1.2GHz machine), so it is built once and cached.
        NSString *schemeCharacters = @"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
        NSCharacterSet *allowedSet = [NSCharacterSet characterSetWithCharactersInString:schemeCharacters];
        nonSchemeCharacters = [[allowedSet invertedSet] retain];
    }

    NSRange candidate = NSMakeRange(0, colonRange.location);
    NSRange offending = [self rangeOfCharacterFromSet:nonSchemeCharacters options:0 range:candidate];
    if (offending.location != NSNotFound) {
        return notFound;
    }
    return candidate;
}
1111
// Returns YES when the string, after whitespace trimming, has a URL scheme
// according to -_webkit_rangeOfURLScheme.
-(BOOL)_webkit_looksLikeAbsoluteURL
{
    // Trim whitespace because _web_URLWithString allows whitespace.
    NSString *trimmed = [self _webkit_stringByTrimmingWhitespace];
    NSRange schemeRange = [trimmed _webkit_rangeOfURLScheme];
    return schemeRange.location != NSNotFound;
}
1117
// Returns everything after the first '#' in the string, or nil when the
// string contains no '#'.
- (NSString *)_webkit_URLFragment
{
    NSRange hashRange = [self rangeOfString:@"#" options:NSLiteralSearch];
    if (hashRange.location != NSNotFound) {
        return [self substringFromIndex:hashRange.location + 1];
    }
    return nil;
}
1127
1128@end
1129