1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 6 #define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 7 8 #include "encodings/compact_lang_det/letterscript_enum.h" 9 #include "encodings/compact_lang_det/compact_lang_det_impl.h" 10 11 namespace getone { 12 static const int kMaxScriptBuffer = 4096; 13 static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; 14 static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room 15 static const int kMaxAnswerBuffer = 256; 16 17 typedef enum UnicodeLScript ULScript; 18 19 typedef struct { 20 char* text; // Pointer to the span, somewhere 21 int text_bytes; // Number of bytes of text in the span 22 int offset; // Offset of start of span in original input buffer 23 ULScript script; // Script of all the letters in this span 24 Language lang; // Language identified for this span 25 bool truncated; // true if buffer filled up before a 26 // different script or EOF was found 27 } LangSpan; 28 29 IsContinuationByte(char c)30 static inline bool IsContinuationByte(char c) { 31 return static_cast<signed char>(c) < -64; 32 } 33 34 // Gets lscript number for letters; always returns 35 // 0 (common script) for non-letters 36 int GetUTF8LetterScriptNum(const char* src); 37 38 39 // Update src pointer to point to next quadgram, +2..+5 40 // Looks at src[0..4] 41 const char* AdvanceQuad(const char* src); 42 } // end namespace getone 43 44 45 46 47 48 49 class ScriptScanner { 50 public: 51 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); 52 ~ScriptScanner(); 53 54 // Copy next run of same-script non-tag letters to buffer [NUL terminated] 55 bool GetOneScriptSpan(getone::LangSpan* span); 56 57 // Force Latin and Cyrillic scripts to be lowercase 58 void LowerScriptSpan(getone::LangSpan* span); 59 60 // Copy next run of same-script non-tag letters to buffer [NUL terminated] 61 // Force Latin and Cyrillic scripts to be lowercase 62 bool GetOneScriptSpanLower(getone::LangSpan* span); 63 64 private: 65 int SkipToFrontOfSpan(const char* src, int len, int* script); 66 67 const char* start_byte_; 68 const char* next_byte_; 69 const char* next_byte_limit_; 70 int byte_length_; 71 bool is_plain_text_; 72 char* script_buffer_; // Holds text with expanded entities 73 char* script_buffer_lower_; // Holds lowercased text 74 }; 75 76 77 class LangScanner { 78 public: 79 LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj, 80 getone::LangSpan* spn, int smoothwidth, int smoothcandidates, 81 int maxlangs, int minlangspan); 82 ~LangScanner(); 83 84 script()85 int script() {return script_;} 86 87 // Use new text 88 // Keep smoothing state if same script, otherwise reinit smoothing 89 void NewText(getone::LangSpan* spn); 90 91 bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping 92 bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping 93 94 // The real ones 95 bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, 96 getone::LangSpan* span); 97 bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, 98 getone::LangSpan* span); 99 100 // Increases language bias by delta 101 void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj, 102 Language key, int delta); 103 104 // For debugging output 105 int next_answer_; 106 char answer_buffer_[getone::kMaxAnswerBuffer]; 107 char answer_buffer2_[getone::kMaxAnswerBuffer]; 108 char answer_buffer3_[getone::kMaxAnswerBuffer]; 109 char answer_buffer4_[getone::kMaxAnswerBuffer]; 110 111 private: 112 const char* start_byte_; 113 const char* next_byte_limit_; 114 const char* next_byte_; 115 const char* onelangspan_begin_; 116 int byte_length_; 117 int script_; 118 Language spanlang_; 119 int smoothwidth_; 120 int smoothwidth_2_; 121 int smoothcandidates_; 122 int maxlangs_; 123 int minlangspan_; 124 int rb_size_; 125 int next_rb_; 126 int rb_mask_; 127 uint32* rb_; 128 int* offset_rb_; 129 }; 130 131 #endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 132