// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_UNICODE_H_
29 #define V8_UNICODE_H_
30 
31 #include <sys/types.h>
32 
/**
 * \file
 * Definitions and convenience functions for working with Unicode.
 */
37 
38 namespace unibrow {
39 
// A character value (Unicode code point or UTF-16 code unit) held in an
// unsigned int.
typedef unsigned int uchar;
// A single raw byte, e.g. one unit of encoded character data.
typedef unsigned char byte;
42 
/**
 * Upper bound on the number of characters that can result from
 * converting the case of a single character.
 */
const int kMaxMappingSize = 4;
48 
49 template <class T, int size = 256>
50 class Predicate {
51  public:
Predicate()52   inline Predicate() { }
53   inline bool get(uchar c);
54  private:
55   friend class Test;
56   bool CalculateValue(uchar c);
57   struct CacheEntry {
CacheEntryCacheEntry58     inline CacheEntry() : code_point_(0), value_(0) { }
CacheEntryCacheEntry59     inline CacheEntry(uchar code_point, bool value)
60       : code_point_(code_point),
61         value_(value) { }
62     uchar code_point_ : 21;
63     bool value_ : 1;
64   };
65   static const int kSize = size;
66   static const int kMask = kSize - 1;
67   CacheEntry entries_[kSize];
68 };
69 
70 // A cache used in case conversion.  It caches the value for characters
71 // that either have no mapping or map to a single character independent
72 // of context.  Characters that map to more than one character or that
73 // map differently depending on context are always looked up.
74 template <class T, int size = 256>
75 class Mapping {
76  public:
Mapping()77   inline Mapping() { }
78   inline int get(uchar c, uchar n, uchar* result);
79  private:
80   friend class Test;
81   int CalculateValue(uchar c, uchar n, uchar* result);
82   struct CacheEntry {
CacheEntryCacheEntry83     inline CacheEntry() : code_point_(kNoChar), offset_(0) { }
CacheEntryCacheEntry84     inline CacheEntry(uchar code_point, signed offset)
85       : code_point_(code_point),
86         offset_(offset) { }
87     uchar code_point_;
88     signed offset_;
89     static const int kNoChar = (1 << 21) - 1;
90   };
91   static const int kSize = size;
92   static const int kMask = kSize - 1;
93   CacheEntry entries_[kSize];
94 };
95 
96 class UnicodeData {
97  private:
98   friend class Test;
99   static int GetByteCount();
100   static const uchar kMaxCodePoint;
101 };
102 
// --- U t f   8   a n d   16 ---
104 
// A (data, length) pair describing a block of input.  Data is the handle
// type (for example const char*); it must be constructible from 0 so that
// the default constructor can build an empty buffer.
template <typename Data>
class Buffer {
 public:
  inline Buffer(Data data, unsigned length) : data_(data), length_(length) { }
  inline Buffer() : data_(0), length_(0) { }
  // Accessors are const so that they can be used on const buffers too.
  Data data() const { return data_; }
  unsigned length() const { return length_; }

 private:
  Data data_;
  unsigned length_;
};
116 
117 
118 class Utf16 {
119  public:
IsLeadSurrogate(int code)120   static inline bool IsLeadSurrogate(int code) {
121     if (code == kNoPreviousCharacter) return false;
122     return (code & 0xfc00) == 0xd800;
123   }
IsTrailSurrogate(int code)124   static inline bool IsTrailSurrogate(int code) {
125     if (code == kNoPreviousCharacter) return false;
126     return (code & 0xfc00) == 0xdc00;
127   }
128 
CombineSurrogatePair(uchar lead,uchar trail)129   static inline int CombineSurrogatePair(uchar lead, uchar trail) {
130     return 0x10000 + ((lead & 0x3ff) << 10) + (trail & 0x3ff);
131   }
132   static const int kNoPreviousCharacter = -1;
133   static const uchar kMaxNonSurrogateCharCode = 0xffff;
134   // Encoding a single UTF-16 code unit will produce 1, 2 or 3 bytes
135   // of UTF-8 data.  The special case where the unit is a surrogate
136   // trail produces 1 byte net, because the encoding of the pair is
137   // 4 bytes and the 3 bytes that were used to encode the lead surrogate
138   // can be reclaimed.
139   static const int kMaxExtraUtf8BytesForOneUtf16CodeUnit = 3;
140   // One UTF-16 surrogate is endoded (illegally) as 3 UTF-8 bytes.
141   // The illegality stems from the surrogate not being part of a pair.
142   static const int kUtf8BytesToCodeASurrogate = 3;
LeadSurrogate(int char_code)143   static inline uchar LeadSurrogate(int char_code) {
144     return 0xd800 + (((char_code - 0x10000) >> 10) & 0x3ff);
145   }
TrailSurrogate(int char_code)146   static inline uchar TrailSurrogate(int char_code) {
147     return 0xdc00 + (char_code & 0x3ff);
148   }
149 };
150 
151 
152 class Utf8 {
153  public:
154   static inline uchar Length(uchar chr, int previous);
155   static inline unsigned Encode(
156       char* out, uchar c, int previous);
157   static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
158       unsigned capacity, unsigned* chars_read, unsigned* offset);
159   static uchar CalculateValue(const byte* str,
160                               unsigned length,
161                               unsigned* cursor);
162   static const uchar kBadChar = 0xFFFD;
163   static const unsigned kMaxEncodedSize   = 4;
164   static const unsigned kMaxOneByteChar   = 0x7f;
165   static const unsigned kMaxTwoByteChar   = 0x7ff;
166   static const unsigned kMaxThreeByteChar = 0xffff;
167   static const unsigned kMaxFourByteChar  = 0x1fffff;
168 
169   // A single surrogate is coded as a 3 byte UTF-8 sequence, but two together
170   // that match are coded as a 4 byte UTF-8 sequence.
171   static const unsigned kBytesSavedByCombiningSurrogates = 2;
172   static const unsigned kSizeOfUnmatchedSurrogate = 3;
173 
174  private:
175   template <unsigned s> friend class Utf8InputBuffer;
176   friend class Test;
177   static inline uchar ValueOf(const byte* str,
178                               unsigned length,
179                               unsigned* cursor);
180 };
181 
// --- C h a r a c t e r   S t r e a m ---
183 
184 class CharacterStream {
185  public:
186   inline uchar GetNext();
has_more()187   inline bool has_more() { return remaining_ != 0; }
188   // Note that default implementation is not efficient.
189   virtual void Seek(unsigned);
190   unsigned Length();
191   unsigned Utf16Length();
~CharacterStream()192   virtual ~CharacterStream() { }
193   static inline bool EncodeCharacter(uchar c, byte* buffer, unsigned capacity,
194       unsigned& offset);
195   static inline bool EncodeAsciiCharacter(uchar c, byte* buffer,
196       unsigned capacity, unsigned& offset);
197   static inline bool EncodeNonAsciiCharacter(uchar c, byte* buffer,
198       unsigned capacity, unsigned& offset);
199   static inline uchar DecodeCharacter(const byte* buffer, unsigned* offset);
200   virtual void Rewind() = 0;
201 
202  protected:
203   virtual void FillBuffer() = 0;
204   // The number of characters left in the current buffer
205   unsigned remaining_;
206   // The current offset within the buffer
207   unsigned cursor_;
208   // The buffer containing the decoded characters.
209   const byte* buffer_;
210 };
211 
// --- I n p u t   B u f f e r ---
213 
214 /**
215  * Provides efficient access to encoded characters in strings.  It
216  * does so by reading characters one block at a time, rather than one
217  * character at a time, which gives string implementations an
218  * opportunity to optimize the decoding.
219  */
220 template <class Reader, class Input = Reader*, unsigned kSize = 256>
221 class InputBuffer : public CharacterStream {
222  public:
223   virtual void Rewind();
224   inline void Reset(Input input);
225   void Seek(unsigned position);
226   inline void Reset(unsigned position, Input input);
227  protected:
InputBuffer()228   InputBuffer() { }
InputBuffer(Input input)229   explicit InputBuffer(Input input) { Reset(input); }
230   virtual void FillBuffer();
231 
232   // A custom offset that can be used by the string implementation to
233   // mark progress within the encoded string.
234   unsigned offset_;
235   // The input string
236   Input input_;
237   // To avoid heap allocation, we keep an internal buffer to which
238   // the encoded string can write its characters.  The string
239   // implementation is free to decide whether it wants to use this
240   // buffer or not.
241   byte util_buffer_[kSize];
242 };
243 
// --- U t f 8   I n p u t   B u f f e r ---
245 
246 template <unsigned s = 256>
247 class Utf8InputBuffer : public InputBuffer<Utf8, Buffer<const char*>, s> {
248  public:
Utf8InputBuffer()249   inline Utf8InputBuffer() { }
250   inline Utf8InputBuffer(const char* data, unsigned length);
Reset(const char * data,unsigned length)251   inline void Reset(const char* data, unsigned length) {
252     InputBuffer<Utf8, Buffer<const char*>, s>::Reset(
253         Buffer<const char*>(data, length));
254   }
255 };
256 
257 
258 struct Uppercase {
259   static bool Is(uchar c);
260 };
261 struct Lowercase {
262   static bool Is(uchar c);
263 };
264 struct Letter {
265   static bool Is(uchar c);
266 };
267 struct Space {
268   static bool Is(uchar c);
269 };
270 struct Number {
271   static bool Is(uchar c);
272 };
273 struct WhiteSpace {
274   static bool Is(uchar c);
275 };
276 struct LineTerminator {
277   static bool Is(uchar c);
278 };
279 struct CombiningMark {
280   static bool Is(uchar c);
281 };
282 struct ConnectorPunctuation {
283   static bool Is(uchar c);
284 };
285 struct ToLowercase {
286   static const int kMaxWidth = 3;
287   static int Convert(uchar c,
288                      uchar n,
289                      uchar* result,
290                      bool* allow_caching_ptr);
291 };
292 struct ToUppercase {
293   static const int kMaxWidth = 3;
294   static int Convert(uchar c,
295                      uchar n,
296                      uchar* result,
297                      bool* allow_caching_ptr);
298 };
299 struct Ecma262Canonicalize {
300   static const int kMaxWidth = 1;
301   static int Convert(uchar c,
302                      uchar n,
303                      uchar* result,
304                      bool* allow_caching_ptr);
305 };
306 struct Ecma262UnCanonicalize {
307   static const int kMaxWidth = 4;
308   static int Convert(uchar c,
309                      uchar n,
310                      uchar* result,
311                      bool* allow_caching_ptr);
312 };
313 struct CanonicalizationRange {
314   static const int kMaxWidth = 1;
315   static int Convert(uchar c,
316                      uchar n,
317                      uchar* result,
318                      bool* allow_caching_ptr);
319 };
320 
321 }  // namespace unibrow
322 
323 #endif  // V8_UNICODE_H_
324