• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <stdlib.h>
6 
7 #include "base/logging.h"
8 #include "base/string_util.h"
9 #include "net/base/net_util.h"
10 #include "net/tools/dump_cache/url_to_filename_encoder.h"
11 
12 using std::string;
13 
14 namespace {
15 
16 // Returns 1 if buf is prefixed by "num_digits" of hex digits
17 // Teturns 0 otherwise.
18 // The function checks for '\0' for string termination.
HexDigitsPrefix(const char * buf,int num_digits)19 int HexDigitsPrefix(const char* buf, int num_digits) {
20   for (int i = 0; i < num_digits; i++) {
21     if (!IsHexDigit(buf[i]))
22       return 0;  // This also detects end of string as '\0' is not xdigit.
23   }
24   return 1;
25 }
26 
27 #ifdef WIN32
28 #define strtoull _strtoui64
29 #endif
30 
31 // A simple parser for long long values. Returns the parsed value if a
32 // valid integer is found; else returns deflt
33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s.
ParseLeadingHex64Value(const char * str,uint64 deflt)34 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
35   char *error = NULL;
36   const uint64 value = strtoull(str, &error, 16);
37   return (error == str) ? deflt : value;
38 }
39 
40 }
41 
42 namespace net {
43 
// The escape character choice is made here -- all code and tests in this
// directory are based off of this constant.  However, our testdata
// has tons of dependencies on this, so it cannot be changed without
// re-running those tests and fixing them.
const char UrlToFilenameEncoder::kEscapeChar = ',';
// Written right after kEscapeChar by AppendSegment() when it splits an
// over-long segment, marking the separator that follows as artificial.
const char UrlToFilenameEncoder::kTruncationChar = '-';
// Segments longer than this many characters are split by AppendSegment().
const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
51 
AppendSegment(string * segment,string * dest)52 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
53   CHECK(!segment->empty());
54   if ((*segment == ".") || (*segment == "..")) {
55     dest->append(1, kEscapeChar);
56     dest->append(*segment);
57     segment->clear();
58   } else {
59     size_t segment_size = segment->size();
60     if (segment_size > kMaximumSubdirectoryLength) {
61       // We need to inject ",-" at the end of the segment to signify that
62       // we are inserting an artificial '/'.  This means we have to chop
63       // off at least two characters to make room.
64       segment_size = kMaximumSubdirectoryLength - 2;
65 
66       // But we don't want to break up an escape sequence that happens to lie at
67       // the end.  Escape sequences are at most 2 characters.
68       if ((*segment)[segment_size - 1] == kEscapeChar) {
69         segment_size -= 1;
70       } else if ((*segment)[segment_size - 2] == kEscapeChar) {
71         segment_size -= 2;
72       }
73       dest->append(segment->data(), segment_size);
74       dest->append(1, kEscapeChar);
75       dest->append(1, kTruncationChar);
76       segment->erase(0, segment_size);
77 
78       // At this point, if we had segment_size=3, and segment="abcd",
79       // then after this erase, we will have written "abc,-" and set segment="d"
80     } else {
81       dest->append(*segment);
82       segment->clear();
83     }
84   }
85 }
86 
EncodeSegment(const string & filename_prefix,const string & escaped_ending,char dir_separator,string * encoded_filename)87 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
88                                          const string& escaped_ending,
89                                          char dir_separator,
90                                          string* encoded_filename) {
91   string filename_ending = UrlUtilities::Unescape(escaped_ending);
92 
93   char encoded[3];
94   int encoded_len;
95   string segment;
96 
97   // TODO(jmarantz): This code would be a bit simpler if we disallowed
98   // Instaweb allowing filename_prefix to not end in "/".  We could
99   // then change the is routine to just take one input string.
100   size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
101   if (start_of_segment == string::npos) {
102     segment = filename_prefix;
103   } else {
104     segment = filename_prefix.substr(start_of_segment + 1);
105     *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
106   }
107 
108   size_t index = 0;
109   // Special case the first / to avoid adding a leading kEscapeChar.
110   if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
111     encoded_filename->append(segment);
112     segment.clear();
113     encoded_filename->append(1, dir_separator);
114     ++index;
115   }
116 
117   for (; index < filename_ending.length(); ++index) {
118     unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
119 
120     // Note: instead of outputing an empty segment, we let the second slash
121     // be escaped below.
122     if ((ch == dir_separator) && !segment.empty()) {
123       AppendSegment(&segment, encoded_filename);
124       encoded_filename->append(1, dir_separator);
125       segment.clear();
126     } else {
127       // After removing unsafe chars the only safe ones are _.=+- and alphanums.
128       if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
129           (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
130           (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
131         encoded[0] = ch;
132         encoded_len = 1;
133       } else {
134         encoded[0] = kEscapeChar;
135         encoded[1] = ch / 16;
136         encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
137         encoded[2] = ch % 16;
138         encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
139         encoded_len = 3;
140       }
141       segment.append(encoded, encoded_len);
142 
143       // If segment is too big, we must chop it into chunks.
144       if (segment.size() > kMaximumSubdirectoryLength) {
145         AppendSegment(&segment, encoded_filename);
146         encoded_filename->append(1, dir_separator);
147       }
148     }
149   }
150 
151   // Append "," to the leaf filename so the leaf can also be a branch., e.g.
152   // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
153   // /a/b/c/d".  So we will rename the "d" here to "d,".  If doing that pushed
154   // us over the 128 char limit, then we will need to append "/" and the
155   // remaining chars.
156   segment += kEscapeChar;
157   AppendSegment(&segment, encoded_filename);
158   if (!segment.empty()) {
159     // The last overflow segment is special, because we appended in
160     // kEscapeChar above.  We won't need to check it again for size
161     // or further escaping.
162     encoded_filename->append(1, dir_separator);
163     encoded_filename->append(segment);
164   }
165 }
166 
167 // Note: this decoder is not the exact inverse of the EncodeSegment above,
168 // because it does not take into account a prefix.
Decode(const string & encoded_filename,char dir_separator,string * decoded_url)169 bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
170                                   char dir_separator,
171                                   string* decoded_url) {
172   enum State {
173     kStart,
174     kEscape,
175     kFirstDigit,
176     kTruncate,
177     kEscapeDot
178   };
179   State state = kStart;
180   int char_code = 0;
181   char hex_buffer[3];
182   hex_buffer[2] = '\0';
183   for (size_t i = 0; i < encoded_filename.size(); ++i) {
184     char ch = encoded_filename[i];
185     switch (state) {
186       case kStart:
187         if (ch == kEscapeChar) {
188           state = kEscape;
189         } else if (ch == dir_separator) {
190           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
191         } else {
192           decoded_url->append(1, ch);
193         }
194         break;
195       case kEscape:
196         if (HexDigitsPrefix(&ch, 1) == 1) {
197           hex_buffer[0] = ch;
198           state = kFirstDigit;
199         } else if (ch == kTruncationChar) {
200           state = kTruncate;
201         } else if (ch == '.') {
202           decoded_url->append(1, '.');
203           state = kEscapeDot;  // Look for at most one more dot.
204         } else if (ch == dir_separator) {
205           // Consider url "//x".  This was once encoded to "/,/x,".
206           // This code is what skips the first Escape.
207           decoded_url->append(1, '/');  // URLs only use '/' not '\\'
208           state = kStart;
209         } else {
210           return false;
211         }
212         break;
213       case kFirstDigit:
214         if (HexDigitsPrefix(&ch, 1) == 1) {
215           hex_buffer[1] = ch;
216           uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
217           decoded_url->append(1, static_cast<char>(hex_value));
218           char_code = 0;
219           state = kStart;
220         } else {
221           return false;
222         }
223         break;
224       case kTruncate:
225         if (ch == dir_separator) {
226           // Skip this separator, it was only put in to break up long
227           // path segments, but is not part of the URL.
228           state = kStart;
229         } else {
230           return false;
231         }
232         break;
233       case kEscapeDot:
234         decoded_url->append(1, ch);
235         state = kStart;
236         break;
237     }
238   }
239 
240   // All legal encoded filenames end in kEscapeChar.
241   return (state == kEscape);
242 }
243 
244 // Escape the given input |path| and chop any individual components
245 // of the path which are greater than kMaximumSubdirectoryLength characters
246 // into two chunks.
247 //
248 // This legacy version has several issues with aliasing of different URLs,
249 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
250 // the filenames back into URLs.
251 //
252 // But there is a large body of slurped data which depends on this format,
253 // so leave it as the default for spdy_in_mem_edsm_server.
LegacyEscape(const string & path)254 string UrlToFilenameEncoder::LegacyEscape(const string& path) {
255   string output;
256 
257   // Note:  We also chop paths into medium sized 'chunks'.
258   //        This is due to the incompetence of the windows
259   //        filesystem, which still hasn't figured out how
260   //        to deal with long filenames.
261   int last_slash = 0;
262   for (size_t index = 0; index < path.length(); index++) {
263     char ch = path[index];
264     if (ch == 0x5C)
265       last_slash = index;
266     if ((ch == 0x2D) ||                    // hyphen
267         (ch == 0x5C) || (ch == 0x5F) ||    // backslash, underscore
268         ((0x30 <= ch) && (ch <= 0x39)) ||  // Digits [0-9]
269         ((0x41 <= ch) && (ch <= 0x5A)) ||  // Uppercase [A-Z]
270         ((0x61 <= ch) && (ch <= 0x7A))) {  // Lowercase [a-z]
271       output.append(&path[index], 1);
272     } else {
273       char encoded[3];
274       encoded[0] = 'x';
275       encoded[1] = ch / 16;
276       encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
277       encoded[2] = ch % 16;
278       encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
279       output.append(encoded, 3);
280     }
281     if (index - last_slash > kMaximumSubdirectoryLength) {
282 #ifdef WIN32
283       char slash = '\\';
284 #else
285       char slash = '/';
286 #endif
287       output.append(&slash, 1);
288       last_slash = index;
289     }
290   }
291   return output;
292 }
293 
294 }  // namespace net
295