• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1.  Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  * 2.  Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer in the
12  *     documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
18  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "config.h"
27 
28 #undef WEBKIT_IMPLEMENTATION
29 #undef LOG
30 
31 #include "base/utf_string_conversions.h"
32 #include "net/base/escape.h"
33 #include "PhoneEmailDetector.h"
34 #include "Settings.h"
35 #include "WebString.h"
36 
37 #define LOG_TAG "PhoneNumberDetector"
38 #include <cutils/log.h>
39 
40 #define PHONE_PATTERN "(200) /-.\\ 100 -. 0000"
41 
42 static const char kTelSchemaPrefix[] = "tel:";
43 static const char kEmailSchemaPrefix[] = "mailto:";
44 
45 void FindReset(FindState* state);
46 void FindResetNumber(FindState* state);
47 FoundState FindPartialNumber(const UChar* chars, unsigned length,
48                              FindState* s);
49 struct FindState;
50 
51 static FoundState FindPartialEMail(const UChar* , unsigned length, FindState* );
52 static bool IsDomainChar(UChar ch);
53 static bool IsMailboxChar(UChar ch);
54 
PhoneEmailDetector()55 PhoneEmailDetector::PhoneEmailDetector()
56     : m_foundResult(FOUND_NONE)
57 {
58 }
59 
IsEnabled(const WebKit::WebHitTestInfo & hit_test)60 bool PhoneEmailDetector::IsEnabled(const WebKit::WebHitTestInfo& hit_test)
61 {
62     WebCore::Settings* settings = GetSettings(hit_test);
63     if (!settings)
64         return false;
65     m_isPhoneDetectionEnabled = settings->formatDetectionTelephone();
66     m_isEmailDetectionEnabled = settings->formatDetectionEmail();
67     return m_isEmailDetectionEnabled || m_isPhoneDetectionEnabled;
68 }
69 
FindContent(const string16::const_iterator & begin,const string16::const_iterator & end,size_t * start_pos,size_t * end_pos)70 bool PhoneEmailDetector::FindContent(const string16::const_iterator& begin,
71                              const string16::const_iterator& end,
72                              size_t* start_pos,
73                              size_t* end_pos)
74 {
75     FindReset(&m_findState);
76     m_foundResult = FOUND_NONE;
77     if (m_isPhoneDetectionEnabled)
78         m_foundResult = FindPartialNumber(begin, end - begin, &m_findState);
79     if (m_foundResult == FOUND_COMPLETE)
80         m_prefix = kTelSchemaPrefix;
81     else {
82         FindReset(&m_findState);
83         if (m_isEmailDetectionEnabled)
84             m_foundResult = FindPartialEMail(begin, end - begin, &m_findState);
85         m_prefix = kEmailSchemaPrefix;
86     }
87     *start_pos = m_findState.mStartResult;
88     *end_pos = m_findState.mEndResult;
89     return m_foundResult == FOUND_COMPLETE;
90 }
91 
GetContentText(const WebKit::WebRange & range)92 std::string PhoneEmailDetector::GetContentText(const WebKit::WebRange& range)
93 {
94     if (m_foundResult == FOUND_COMPLETE) {
95         if (m_prefix == kTelSchemaPrefix)
96             return UTF16ToUTF8(m_findState.mStore);
97         else
98             return UTF16ToUTF8(range.toPlainText());
99     }
100     return std::string();
101 }
102 
GetIntentURL(const std::string & content_text)103 GURL PhoneEmailDetector::GetIntentURL(const std::string& content_text)
104 {
105     return GURL(m_prefix +
106             EscapeQueryParamValue(content_text, true));
107 }
108 
FindReset(FindState * state)109 void FindReset(FindState* state)
110 {
111     memset(state, 0, sizeof(FindState));
112     state->mCurrent = ' ';
113     FindResetNumber(state);
114 }
115 
FindResetNumber(FindState * state)116 void FindResetNumber(FindState* state)
117 {
118     state->mOpenParen = false;
119     state->mPattern = (char*) PHONE_PATTERN;
120     state->mStorePtr = state->mStore;
121 }
122 
FindPartialNumber(const UChar * chars,unsigned length,FindState * s)123 FoundState FindPartialNumber(const UChar* chars, unsigned length,
124     FindState* s)
125 {
126     char* pattern = s->mPattern;
127     UChar* store = s->mStorePtr;
128     const UChar* start = chars;
129     const UChar* end = chars + length;
130     const UChar* lastDigit = 0;
131     string16 search16(chars, length);
132     std::string searchSpace = UTF16ToUTF8(search16);
133     do {
134         bool initialized = s->mInitialized;
135         while (chars < end) {
136             if (initialized == false) {
137                 s->mBackTwo = s->mBackOne;
138                 s->mBackOne = s->mCurrent;
139             }
140             UChar ch = s->mCurrent = *chars;
141             do {
142                 char patternChar = *pattern;
143                 switch (patternChar) {
144                     case '2':
145                         if (initialized == false) {
146                             s->mStartResult = chars - start;
147                             initialized = true;
148                         }
149                     case '0':
150                     case '1':
151                         if (ch < patternChar || ch > '9')
152                             goto resetPattern;
153                         *store++ = ch;
154                         pattern++;
155                         lastDigit = chars;
156                         goto nextChar;
157                     case '\0':
158                         if (WTF::isASCIIDigit(ch) == false) {
159                             *store = '\0';
160                             goto checkMatch;
161                         }
162                         goto resetPattern;
163                     case ' ':
164                         if (ch == patternChar)
165                             goto nextChar;
166                         break;
167                     case '(':
168                         if (ch == patternChar) {
169                             s->mStartResult = chars - start;
170                             initialized = true;
171                             s->mOpenParen = true;
172                         }
173                         goto commonPunctuation;
174                     case ')':
175                         if ((ch == patternChar) ^ s->mOpenParen)
176                             goto resetPattern;
177                     default:
178                     commonPunctuation:
179                         if (ch == patternChar) {
180                             pattern++;
181                             goto nextChar;
182                         }
183                 }
184             } while (++pattern); // never false
185     nextChar:
186             chars++;
187         }
188         break;
189 resetPattern:
190         if (s->mContinuationNode)
191             return FOUND_NONE;
192         FindResetNumber(s);
193         pattern = s->mPattern;
194         store = s->mStorePtr;
195     } while (++chars < end);
196 checkMatch:
197     if (WTF::isASCIIDigit(s->mBackOne != '1' ? s->mBackOne : s->mBackTwo)) {
198         return FOUND_NONE;
199     }
200     *store = '\0';
201     s->mStorePtr = store;
202     s->mPattern = pattern;
203     s->mEndResult = lastDigit - start + 1;
204     char pState = pattern[0];
205     return pState == '\0' ? FOUND_COMPLETE : pState == '(' || (WTF::isASCIIDigit(pState) && WTF::isASCIIDigit(pattern[-1])) ?
206         FOUND_NONE : FOUND_PARTIAL;
207 }
208 
FindPartialEMail(const UChar * chars,unsigned length,FindState * s)209 FoundState FindPartialEMail(const UChar* chars, unsigned length,
210     FindState* s)
211 {
212     // the following tables were generated by tests/browser/focusNavigation/BrowserDebug.cpp
213     // hand-edit at your own risk
214     static const int domainTwoLetter[] = {
215         0x02df797c,  // a followed by: [cdefgilmnoqrstuwxz]
216         0x036e73fb,  // b followed by: [abdefghijmnorstvwyz]
217         0x03b67ded,  // c followed by: [acdfghiklmnorsuvxyz]
218         0x02005610,  // d followed by: [ejkmoz]
219         0x001e00d4,  // e followed by: [ceghrstu]
220         0x00025700,  // f followed by: [ijkmor]
221         0x015fb9fb,  // g followed by: [abdefghilmnpqrstuwy]
222         0x001a3400,  // h followed by: [kmnrtu]
223         0x000f7818,  // i followed by: [delmnoqrst]
224         0x0000d010,  // j followed by: [emop]
225         0x0342b1d0,  // k followed by: [eghimnprwyz]
226         0x013e0507,  // l followed by: [abcikrstuvy]
227         0x03fffccd,  // m followed by: [acdghklmnopqrstuvwxyz]
228         0x0212c975,  // n followed by: [acefgilopruz]
229         0x00001000,  // o followed by: [m]
230         0x014e3cf1,  // p followed by: [aefghklmnrstwy]
231         0x00000001,  // q followed by: [a]
232         0x00504010,  // r followed by: [eouw]
233         0x032a7fdf,  // s followed by: [abcdeghijklmnortvyz]
234         0x026afeec,  // t followed by: [cdfghjklmnoprtvwz]
235         0x03041441,  // u followed by: [agkmsyz]
236         0x00102155,  // v followed by: [aceginu]
237         0x00040020,  // w followed by: [fs]
238         0x00000000,  // x
239         0x00180010,  // y followed by: [etu]
240         0x00401001,  // z followed by: [amw]
241     };
242 
243     static char const* const longDomainNames[] = {
244         "\x03" "ero" "\x03" "rpa",  // aero, arpa
245         "\x02" "iz",  // biz
246         "\x02" "at" "\x02" "om" "\x03" "oop",  // cat, com, coop
247         NULL,  // d
248         "\x02" "du",  // edu
249         NULL,  // f
250         "\x02" "ov",  // gov
251         NULL,  // h
252         "\x03" "nfo" "\x02" "nt",  // info, int
253         "\x03" "obs",  // jobs
254         NULL,  // k
255         NULL,  // l
256         "\x02" "il" "\x03" "obi" "\x05" "useum",  // mil, mobi, museum
257         "\x03" "ame" "\x02" "et",  // name, net
258         "\x02" "rg",  // , org
259         "\x02" "ro",  // pro
260         NULL,  // q
261         NULL,  // r
262         NULL,  // s
263         "\x05" "ravel",  // travel
264         NULL,  // u
265         NULL,  // v
266         NULL,  // w
267         NULL,  // x
268         NULL,  // y
269         NULL,  // z
270     };
271 
272     const UChar* start = chars;
273     const UChar* end = chars + length;
274     while (chars < end) {
275         UChar ch = *chars++;
276         if (ch != '@')
277             continue;
278         const UChar* atLocation = chars - 1;
279         // search for domain
280         ch = *chars++ | 0x20; // convert uppercase to lower
281         if (ch < 'a' || ch > 'z')
282             continue;
283         while (chars < end) {
284             ch = *chars++;
285             if (IsDomainChar(ch) == false)
286                 goto nextAt;
287             if (ch != '.')
288                 continue;
289             UChar firstLetter = *chars++ | 0x20; // first letter of the domain
290             if (chars >= end)
291                 return FOUND_NONE; // only one letter; must be at least two
292             firstLetter -= 'a';
293             if (firstLetter > 'z' - 'a')
294                 continue; // non-letter followed '.'
295             int secondLetterMask = domainTwoLetter[firstLetter];
296             ch = *chars | 0x20; // second letter of the domain
297             ch -= 'a';
298             if (ch >= 'z' - 'a')
299                 continue;
300             bool secondMatch = (secondLetterMask & 1 << ch) != 0;
301             const char* wordMatch = longDomainNames[firstLetter];
302             int wordIndex = 0;
303             while (wordMatch != NULL) {
304                 int len = *wordMatch++;
305                 char match;
306                 do {
307                     match = wordMatch[wordIndex];
308                     if (match < 0x20)
309                         goto foundDomainStart;
310                     if (chars[wordIndex] != match)
311                         break;
312                     wordIndex++;
313                 } while (true);
314                 wordMatch += len;
315                 if (*wordMatch == '\0')
316                     break;
317                 wordIndex = 0;
318             }
319             if (secondMatch) {
320                 wordIndex = 1;
321         foundDomainStart:
322                 chars += wordIndex;
323                 if (chars < end) {
324                     ch = *chars;
325                     if (ch != '.') {
326                         if (IsDomainChar(ch))
327                             goto nextDot;
328                     } else if (chars + 1 < end && IsDomainChar(chars[1]))
329                         goto nextDot;
330                 }
331                 // found domain. Search backwards from '@' for beginning of email address
332                 s->mEndResult = chars - start;
333                 chars = atLocation;
334                 if (chars <= start)
335                     goto nextAt;
336                 ch = *--chars;
337                 if (ch == '.')
338                     goto nextAt; // mailbox can't end in period
339                 do {
340                     if (IsMailboxChar(ch) == false) {
341                         chars++;
342                         break;
343                     }
344                     if (chars == start)
345                         break;
346                     ch = *--chars;
347                 } while (true);
348                 UChar firstChar = *chars;
349                 if (firstChar == '.' || firstChar == '@') // mailbox can't start with period or be empty
350                     goto nextAt;
351                 s->mStartResult = chars - start;
352                 return FOUND_COMPLETE;
353             }
354     nextDot:
355             ;
356         }
357 nextAt:
358         chars = atLocation + 1;
359     }
360     return FOUND_NONE;
361 }
362 
IsDomainChar(UChar ch)363 bool IsDomainChar(UChar ch)
364 {
365     static const unsigned body[] = {0x03ff6000, 0x07fffffe, 0x07fffffe}; // 0-9 . - A-Z a-z
366     ch -= 0x20;
367     if (ch > 'z' - 0x20)
368         return false;
369     return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0;
370 }
371 
IsMailboxChar(UChar ch)372 bool IsMailboxChar(UChar ch)
373 {
374     // According to http://en.wikipedia.org/wiki/Email_address
375     // ! # $ % & ' * + - . / 0-9 = ?
376     // A-Z ^ _
377     // ` a-z { | } ~
378     static const unsigned body[] = {0xa3ffecfa, 0xc7fffffe, 0x7fffffff};
379     ch -= 0x20;
380     if (ch > '~' - 0x20)
381         return false;
382     return (body[ch >> 5] & 1 << (ch & 0x1f)) != 0;
383 }
384