1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
18 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "config.h"
27
28 #undef WEBKIT_IMPLEMENTATION
29 #undef LOG
30
31 #include "base/utf_string_conversions.h"
32 #include "net/base/escape.h"
33 #include "PhoneEmailDetector.h"
34 #include "Settings.h"
35 #include "WebString.h"
36
37 #define LOG_TAG "PhoneNumberDetector"
38 #include <cutils/log.h>
39
40 #define PHONE_PATTERN "(200) /-.\\ 100 -. 0000"
41
42 static const char kTelSchemaPrefix[] = "tel:";
43 static const char kEmailSchemaPrefix[] = "mailto:";
44
45 void FindReset(FindState* state);
46 void FindResetNumber(FindState* state);
47 FoundState FindPartialNumber(const UChar* chars, unsigned length,
48 FindState* s);
49 struct FindState;
50
51 static FoundState FindPartialEMail(const UChar* , unsigned length, FindState* );
52 static bool IsDomainChar(UChar ch);
53 static bool IsMailboxChar(UChar ch);
54
PhoneEmailDetector()55 PhoneEmailDetector::PhoneEmailDetector()
56 : m_foundResult(FOUND_NONE)
57 {
58 }
59
IsEnabled(const WebKit::WebHitTestInfo & hit_test)60 bool PhoneEmailDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test)
61 {
62 WebCore::Settings* settings = GetSettings(hit_test);
63 if (!settings)
64 return false;
65 m_isPhoneDetectionEnabled = settings->formatDetectionTelephone();
66 m_isEmailDetectionEnabled = settings->formatDetectionEmail();
67 return m_isEmailDetectionEnabled || m_isPhoneDetectionEnabled;
68 }
69
FindContent(const string16::const_iterator & begin,const string16::const_iterator & end,size_t * start_pos,size_t * end_pos)70 bool PhoneEmailDetector::FindContent(const string16::const_iterator& begin,
71 const string16::const_iterator& end,
72 size_t* start_pos,
73 size_t* end_pos)
74 {
75 FindReset(&m_findState);
76 m_foundResult = FOUND_NONE;
77 if (m_isPhoneDetectionEnabled)
78 m_foundResult = FindPartialNumber(begin, end - begin, &m_findState);
79 if (m_foundResult == FOUND_COMPLETE)
80 m_prefix = kTelSchemaPrefix;
81 else {
82 FindReset(&m_findState);
83 if (m_isEmailDetectionEnabled)
84 m_foundResult = FindPartialEMail(begin, end - begin, &m_findState);
85 m_prefix = kEmailSchemaPrefix;
86 }
87 *start_pos = m_findState.mStartResult;
88 *end_pos = m_findState.mEndResult;
89 return m_foundResult == FOUND_COMPLETE;
90 }
91
GetContentText(const WebKit::WebRange & range)92 std::string PhoneEmailDetector::GetContentText(const WebKit::WebRange& range)
93 {
94 if (m_foundResult == FOUND_COMPLETE) {
95 if (m_prefix == kTelSchemaPrefix)
96 return UTF16ToUTF8(m_findState.mStore);
97 else
98 return UTF16ToUTF8(range.toPlainText());
99 }
100 return std::string();
101 }
102
GetIntentURL(const std::string & content_text)103 GURL PhoneEmailDetector::GetIntentURL(const std::string& content_text)
104 {
105 return GURL(m_prefix +
106 EscapeQueryParamValue(content_text, true));
107 }
108
FindReset(FindState * state)109 void FindReset(FindState* state)
110 {
111 memset(state, 0, sizeof(FindState));
112 state->mCurrent = ' ';
113 FindResetNumber(state);
114 }
115
FindResetNumber(FindState * state)116 void FindResetNumber(FindState* state)
117 {
118 state->mOpenParen = false;
119 state->mPattern = (char*) PHONE_PATTERN;
120 state->mStorePtr = state->mStore;
121 }
122
FindPartialNumber(const UChar * chars,unsigned length,FindState * s)123 FoundState FindPartialNumber(const UChar* chars, unsigned length,
124 FindState* s)
125 {
126 char* pattern = s->mPattern;
127 UChar* store = s->mStorePtr;
128 const UChar* start = chars;
129 const UChar* end = chars + length;
130 const UChar* lastDigit = 0;
131 string16 search16(chars, length);
132 std::string searchSpace = UTF16ToUTF8(search16);
133 do {
134 bool initialized = s->mInitialized;
135 while (chars < end) {
136 if (initialized == false) {
137 s->mBackTwo = s->mBackOne;
138 s->mBackOne = s->mCurrent;
139 }
140 UChar ch = s->mCurrent = *chars;
141 do {
142 char patternChar = *pattern;
143 switch (patternChar) {
144 case '2':
145 if (initialized == false) {
146 s->mStartResult = chars - start;
147 initialized = true;
148 }
149 case '0':
150 case '1':
151 if (ch < patternChar || ch > '9')
152 goto resetPattern;
153 *store++ = ch;
154 pattern++;
155 lastDigit = chars;
156 goto nextChar;
157 case '\0':
158 if (WTF::isASCIIDigit(ch) == false) {
159 *store = '\0';
160 goto checkMatch;
161 }
162 goto resetPattern;
163 case ' ':
164 if (ch == patternChar)
165 goto nextChar;
166 break;
167 case '(':
168 if (ch == patternChar) {
169 s->mStartResult = chars - start;
170 initialized = true;
171 s->mOpenParen = true;
172 }
173 goto commonPunctuation;
174 case ')':
175 if ((ch == patternChar) ^ s->mOpenParen)
176 goto resetPattern;
177 default:
178 commonPunctuation:
179 if (ch == patternChar) {
180 pattern++;
181 goto nextChar;
182 }
183 }
184 } while (++pattern); // never false
185 nextChar:
186 chars++;
187 }
188 break;
189 resetPattern:
190 if (s->mContinuationNode)
191 return FOUND_NONE;
192 FindResetNumber(s);
193 pattern = s->mPattern;
194 store = s->mStorePtr;
195 } while (++chars < end);
196 checkMatch:
197 if (WTF::isASCIIDigit(s->mBackOne != '1' ? s->mBackOne : s->mBackTwo)) {
198 return FOUND_NONE;
199 }
200 *store = '\0';
201 s->mStorePtr = store;
202 s->mPattern = pattern;
203 s->mEndResult = lastDigit - start + 1;
204 char pState = pattern[0];
205 return pState == '\0' ? FOUND_COMPLETE : pState == '(' || (WTF::isASCIIDigit(pState) && WTF::isASCIIDigit(pattern[-1])) ?
206 FOUND_NONE : FOUND_PARTIAL;
207 }
208
FindPartialEMail(const UChar * chars,unsigned length,FindState * s)209 FoundState FindPartialEMail(const UChar* chars, unsigned length,
210 FindState* s)
211 {
212 // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp
213 // hand-edit at your own risk
214 static const int domainTwoLetter[] = {
215 0x02df797c, // a followed by: [cdefgilmnoqrstuwxz]
216 0x036e73fb, // b followed by: [abdefghijmnorstvwyz]
217 0x03b67ded, // c followed by: [acdfghiklmnorsuvxyz]
218 0x02005610, // d followed by: [ejkmoz]
219 0x001e00d4, // e followed by: [ceghrstu]
220 0x00025700, // f followed by: [ijkmor]
221 0x015fb9fb, // g followed by: [abdefghilmnpqrstuwy]
222 0x001a3400, // h followed by: [kmnrtu]
223 0x000f7818, // i followed by: [delmnoqrst]
224 0x0000d010, // j followed by: [emop]
225 0x0342b1d0, // k followed by: [eghimnprwyz]
226 0x013e0507, // l followed by: [abcikrstuvy]
227 0x03fffccd, // m followed by: [acdghklmnopqrstuvwxyz]
228 0x0212c975, // n followed by: [acefgilopruz]
229 0x00001000, // o followed by: [m]
230 0x014e3cf1, // p followed by: [aefghklmnrstwy]
231 0x00000001, // q followed by: [a]
232 0x00504010, // r followed by: [eouw]
233 0x032a7fdf, // s followed by: [abcdeghijklmnortvyz]
234 0x026afeec, // t followed by: [cdfghjklmnoprtvwz]
235 0x03041441, // u followed by: [agkmsyz]
236 0x00102155, // v followed by: [aceginu]
237 0x00040020, // w followed by: [fs]
238 0x00000000, // x
239 0x00180010, // y followed by: [etu]
240 0x00401001, // z followed by: [amw]
241 };
242
243 static char const* const longDomainNames[] = {
244 "\x03" "ero" "\x03" "rpa", // aero, arpa
245 "\x02" "iz", // biz
246 "\x02" "at" "\x02" "om" "\x03" "oop", // cat, com, coop
247 NULL, // d
248 "\x02" "du", // edu
249 NULL, // f
250 "\x02" "ov", // gov
251 NULL, // h
252 "\x03" "nfo" "\x02" "nt", // info, int
253 "\x03" "obs", // jobs
254 NULL, // k
255 NULL, // l
256 "\x02" "il" "\x03" "obi" "\x05" "useum", // mil, mobi, museum
257 "\x03" "ame" "\x02" "et", // name, net
258 "\x02" "rg", // , org
259 "\x02" "ro", // pro
260 NULL, // q
261 NULL, // r
262 NULL, // s
263 "\x05" "ravel", // travel
264 NULL, // u
265 NULL, // v
266 NULL, // w
267 NULL, // x
268 NULL, // y
269 NULL, // z
270 };
271
272 const UChar* start = chars;
273 const UChar* end = chars + length;
274 while (chars < end) {
275 UChar ch = *chars++;
276 if (ch != '@')
277 continue;
278 const UChar* atLocation = chars - 1;
279 // search for domain
280 ch = *chars++ | 0x20; // convert uppercase to lower
281 if (ch < 'a' || ch > 'z')
282 continue;
283 while (chars < end) {
284 ch = *chars++;
285 if (IsDomainChar(ch) == false)
286 goto nextAt;
287 if (ch != '.')
288 continue;
289 UChar firstLetter = *chars++ | 0x20; // first letter of the domain
290 if (chars >= end)
291 return FOUND_NONE; // only one letter; must be at least two
292 firstLetter -= 'a';
293 if (firstLetter > 'z' - 'a')
294 continue; // non-letter followed '.'
295 int secondLetterMask = domainTwoLetter[firstLetter];
296 ch = *chars | 0x20; // second letter of the domain
297 ch -= 'a';
298 if (ch >= 'z' - 'a')
299 continue;
300 bool secondMatch = (secondLetterMask & 1 << ch) != 0;
301 const char* wordMatch = longDomainNames[firstLetter];
302 int wordIndex = 0;
303 while (wordMatch != NULL) {
304 int len = *wordMatch++;
305 char match;
306 do {
307 match = wordMatch[wordIndex];
308 if (match < 0x20)
309 goto foundDomainStart;
310 if (chars[wordIndex] != match)
311 break;
312 wordIndex++;
313 } while (true);
314 wordMatch += len;
315 if (*wordMatch == '\0')
316 break;
317 wordIndex = 0;
318 }
319 if (secondMatch) {
320 wordIndex = 1;
321 foundDomainStart:
322 chars += wordIndex;
323 if (chars < end) {
324 ch = *chars;
325 if (ch != '.') {
326 if (IsDomainChar(ch))
327 goto nextDot;
328 } else if (chars + 1 < end && IsDomainChar(chars[1]))
329 goto nextDot;
330 }
331 // found domain. Search backwards from '@' for beginning of email address
332 s->mEndResult = chars - start;
333 chars = atLocation;
334 if (chars <= start)
335 goto nextAt;
336 ch = *--chars;
337 if (ch == '.')
338 goto nextAt; // mailbox can't end in period
339 do {
340 if (IsMailboxChar(ch) == false) {
341 chars++;
342 break;
343 }
344 if (chars == start)
345 break;
346 ch = *--chars;
347 } while (true);
348 UChar firstChar = *chars;
349 if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty
350 goto nextAt;
351 s->mStartResult = chars - start;
352 return FOUND_COMPLETE;
353 }
354 nextDot:
355 ;
356 }
357 nextAt:
358 chars = atLocation + 1;
359 }
360 return FOUND_NONE;
361 }
362
IsDomainChar(UChar ch)363 bool IsDomainChar(UChar ch)
364 {
365 static const unsigned body[] = {0x03ff6000, 0x07fffffe, 0x07fffffe}; // 0-9 . - A-Z a-z
366 ch -= 0x20;
367 if (ch > 'z' - 0x20)
368 return false;
369 return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0;
370 }
371
IsMailboxChar(UChar ch)372 bool IsMailboxChar(UChar ch)
373 {
374 // According to http://en.wikipedia.org/wiki/Email_address
375 // ! # $ % & ' * + - . / 0-9 = ?
376 // A-Z ^ _
377 // ` a-z { | } ~
378 static const unsigned body[] = {0xa3ffecfa, 0xc7fffffe, 0x7fffffff};
379 ch -= 0x20;
380 if (ch > '~' - 0x20)
381 return false;
382 return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0;
383 }
384