• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_UNICODE_DECODER_H_
6 #define V8_UNICODE_DECODER_H_
7 
8 #include <sys/types.h>
9 #include "src/globals.h"
10 
11 namespace unibrow {
12 
13 class Utf8DecoderBase {
14  public:
15   // Initialization done in subclass.
16   inline Utf8DecoderBase();
17   inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
18                          const uint8_t* stream, size_t stream_length);
Utf16Length()19   inline size_t Utf16Length() const { return utf16_length_; }
20 
21  protected:
22   // This reads all characters and sets the utf16_length_.
23   // The first buffer_length utf16 chars are cached in the buffer.
24   void Reset(uint16_t* buffer, size_t buffer_length, const uint8_t* stream,
25              size_t stream_length);
26   static void WriteUtf16Slow(const uint8_t* stream, size_t stream_length,
27                              uint16_t* data, size_t length);
28   const uint8_t* unbuffered_start_;
29   size_t unbuffered_length_;
30   size_t utf16_length_;
31   bool last_byte_of_buffer_unused_;
32 
33  private:
34   DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
35 };
36 
37 template <size_t kBufferSize>
38 class Utf8Decoder : public Utf8DecoderBase {
39  public:
Utf8Decoder()40   inline Utf8Decoder() {}
41   inline Utf8Decoder(const char* stream, size_t length);
42   inline void Reset(const char* stream, size_t length);
43   inline size_t WriteUtf16(uint16_t* data, size_t length) const;
44 
45  private:
46   uint16_t buffer_[kBufferSize];
47 };
48 
49 
Utf8DecoderBase()50 Utf8DecoderBase::Utf8DecoderBase()
51     : unbuffered_start_(NULL),
52       unbuffered_length_(0),
53       utf16_length_(0),
54       last_byte_of_buffer_unused_(false) {}
55 
56 
Utf8DecoderBase(uint16_t * buffer,size_t buffer_length,const uint8_t * stream,size_t stream_length)57 Utf8DecoderBase::Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
58                                  const uint8_t* stream, size_t stream_length) {
59   Reset(buffer, buffer_length, stream, stream_length);
60 }
61 
62 
63 template <size_t kBufferSize>
Utf8Decoder(const char * stream,size_t length)64 Utf8Decoder<kBufferSize>::Utf8Decoder(const char* stream, size_t length)
65     : Utf8DecoderBase(buffer_, kBufferSize,
66                       reinterpret_cast<const uint8_t*>(stream), length) {}
67 
68 
69 template <size_t kBufferSize>
Reset(const char * stream,size_t length)70 void Utf8Decoder<kBufferSize>::Reset(const char* stream, size_t length) {
71   Utf8DecoderBase::Reset(buffer_, kBufferSize,
72                          reinterpret_cast<const uint8_t*>(stream), length);
73 }
74 
75 
76 template <size_t kBufferSize>
WriteUtf16(uint16_t * data,size_t length)77 size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
78                                             size_t length) const {
79   DCHECK(length > 0);
80   if (length > utf16_length_) length = utf16_length_;
81   // memcpy everything in buffer.
82   size_t buffer_length =
83       last_byte_of_buffer_unused_ ? kBufferSize - 1 : kBufferSize;
84   size_t memcpy_length = length <= buffer_length ? length : buffer_length;
85   v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
86   if (length <= buffer_length) return length;
87   DCHECK(unbuffered_start_ != NULL);
88   // Copy the rest the slow way.
89   WriteUtf16Slow(unbuffered_start_, unbuffered_length_, data + buffer_length,
90                  length - buffer_length);
91   return length;
92 }
93 
// Helpers for the Latin-1 (single byte) character range.
class Latin1 {
 public:
  // Largest code unit that is representable in Latin-1.
  static const unsigned kMaxChar = 0xff;
  // Returns 0 if the character does not convert to a single Latin-1 character
  // or if the character does not convert back to Latin-1 via the inverse
  // operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
};
102 
103 
ConvertNonLatin1ToLatin1(uint16_t c)104 uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
105   DCHECK(c > Latin1::kMaxChar);
106   switch (c) {
107     // This are equivalent characters in unicode.
108     case 0x39c:
109     case 0x3bc:
110       return 0xb5;
111     // This is an uppercase of a Latin-1 character
112     // outside of Latin-1.
113     case 0x178:
114       return 0xff;
115   }
116   return 0;
117 }
118 
119 
120 }  // namespace unibrow
121 
122 #endif  // V8_UNICODE_DECODER_H_
123