// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
6 #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
7 
8 #include "encodings/compact_lang_det/letterscript_enum.h"
9 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
10 
11 namespace getone {
12   static const int kMaxScriptBuffer = 4096;
13   static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
14   static const int kMaxScriptBytes = kMaxScriptBuffer- 8;   // Leave some room
15   static const int kMaxAnswerBuffer = 256;
16 
17   typedef enum UnicodeLScript ULScript;
18 
19   typedef struct {
20     char* text;             // Pointer to the span, somewhere
21     int text_bytes;         // Number of bytes of text in the span
22     int offset;             // Offset of start of span in original input buffer
23     ULScript script;        // Script of all the letters in this span
24     Language lang;          // Language identified for this span
25     bool truncated;         // true if buffer filled up before a
26                             // different script or EOF was found
27   } LangSpan;
28 
29 
IsContinuationByte(char c)30   static inline bool IsContinuationByte(char c) {
31     return static_cast<signed char>(c) < -64;
32   }
33 
34   // Gets lscript number for letters; always returns
35   //   0 (common script) for non-letters
36   int GetUTF8LetterScriptNum(const char* src);
37 
38 
39   // Update src pointer to point to next quadgram, +2..+5
40   // Looks at src[0..4]
41   const char* AdvanceQuad(const char* src);
42 }     // end namespace getone
43 
44 
45 
46 
47 
48 
49 class ScriptScanner {
50  public:
51   ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52   ~ScriptScanner();
53 
54   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
55   bool GetOneScriptSpan(getone::LangSpan* span);
56 
57   // Force Latin and Cyrillic scripts to be lowercase
58   void LowerScriptSpan(getone::LangSpan* span);
59 
60   // Copy next run of same-script non-tag letters to buffer [NUL terminated]
61   // Force Latin and Cyrillic scripts to be lowercase
62   bool GetOneScriptSpanLower(getone::LangSpan* span);
63 
64  private:
65   int SkipToFrontOfSpan(const char* src, int len, int* script);
66 
67   const char* start_byte_;
68   const char* next_byte_;
69   const char* next_byte_limit_;
70   int byte_length_;
71   bool is_plain_text_;
72   char* script_buffer_;           // Holds text with expanded entities
73   char* script_buffer_lower_;     // Holds lowercased text
74 };
75 
76 
77 class LangScanner {
78  public:
79   LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
80               getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
81               int maxlangs, int minlangspan);
82   ~LangScanner();
83 
84 
script()85   int script() {return script_;}
86 
87   // Use new text
88   // Keep smoothing state if same script, otherwise reinit smoothing
89   void NewText(getone::LangSpan* spn);
90 
91   bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
92   bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping
93 
94   // The real ones
95   bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
96                            getone::LangSpan* span);
97   bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
98                       getone::LangSpan* span);
99 
100   // Increases language bias by delta
101   void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
102                        Language key, int delta);
103 
104   // For debugging output
105   int next_answer_;
106   char answer_buffer_[getone::kMaxAnswerBuffer];
107   char answer_buffer2_[getone::kMaxAnswerBuffer];
108   char answer_buffer3_[getone::kMaxAnswerBuffer];
109   char answer_buffer4_[getone::kMaxAnswerBuffer];
110 
111  private:
112   const char* start_byte_;
113   const char* next_byte_limit_;
114   const char* next_byte_;
115   const char* onelangspan_begin_;
116   int byte_length_;
117   int script_;
118   Language spanlang_;
119   int smoothwidth_;
120   int smoothwidth_2_;
121   int smoothcandidates_;
122   int maxlangs_;
123   int minlangspan_;
124   int rb_size_;
125   int next_rb_;
126   int rb_mask_;
127   uint32* rb_;
128   int* offset_rb_;
129 };
130 
131 #endif  // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
132