/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <unistd.h>
#include <fstream>
#include <regex>

#include "utils/file_utils.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/text/ir/kernels/text_ir.h"
#include "mindspore/core/ir/dtype/type_id.h"

namespace mindspore {
namespace dataset {

// Transform operations for text.
namespace text {

constexpr size_t size_two = 2;
constexpr size_t size_three = 3;
constexpr int64_t value_one = 1;
constexpr int64_t value_two = 2;
constexpr size_t kMaxLoggedRows = 10;

// FUNCTIONS TO CREATE TEXT OPERATIONS
// (In alphabetical order)
#ifndef _WIN32
// BasicTokenizer
struct BasicTokenizer::Data {
  Data(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
       bool with_offsets)
      : lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BasicTokenizer::BasicTokenizer(bool lower_case, bool keep_whitespace, const NormalizeForm normalize_form,
                               bool preserve_unused_token, bool with_offsets)
    : data_(std::make_shared<Data>(lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BasicTokenizer::Parse() {
  return std::make_shared<BasicTokenizerOperation>(data_->lower_case_, data_->keep_whitespace_, data_->normalize_form_,
                                                   data_->preserve_unused_token_, data_->with_offsets_);
}
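
// Usage sketch (illustrative, not part of this file): transforms defined here are
// TensorTransforms intended for Dataset::Map; the dataset `ds` and the column name
// "text" below are assumptions for the example.
//   auto tokenizer = std::make_shared<text::BasicTokenizer>(/*lower_case=*/true);
//   ds = ds->Map({tokenizer}, {"text"});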

// BertTokenizer
struct BertTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool lower_case, bool keep_whitespace,
       const NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        lower_case_(lower_case),
        keep_whitespace_(keep_whitespace),
        normalize_form_(normalize_form),
        preserve_unused_token_(preserve_unused_token),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool lower_case_;
  bool keep_whitespace_;
  NormalizeForm normalize_form_;
  bool preserve_unused_token_;
  bool with_offsets_;
};

BertTokenizer::BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                             int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
                             bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
                             bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, lower_case,
                                   keep_whitespace, normalize_form, preserve_unused_token, with_offsets)) {}

std::shared_ptr<TensorOperation> BertTokenizer::Parse() {
  return std::make_shared<BertTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->lower_case_,
    data_->keep_whitespace_, data_->normalize_form_, data_->preserve_unused_token_, data_->with_offsets_);
}
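
// Usage sketch (illustrative; `vocab` is assumed to be a Vocab built elsewhere,
// and the "##"/"[UNK]" arguments follow the usual BERT conventions):
//   auto bert_tok = std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]");
//   ds = ds->Map({bert_tok}, {"text"});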

// CaseFold
CaseFold::CaseFold() {}

std::shared_ptr<TensorOperation> CaseFold::Parse() { return std::make_shared<CaseFoldOperation>(); }
#endif

// JiebaTokenizer
struct JiebaTokenizer::Data {
  Data(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, bool with_offsets)
      : hmm_path_(CharToString(hmm_path)),
        mp_path_(CharToString(mp_path)),
        mode_(mode),
        with_offsets_(with_offsets),
        words_list_({}) {}
  std::string hmm_path_;
  std::string mp_path_;
  JiebaMode mode_;
  bool with_offsets_;
  std::vector<std::pair<std::string, int64_t>> words_list_;
};

JiebaTokenizer::JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path,
                               const JiebaMode &mode, bool with_offsets)
    : data_(std::make_shared<Data>(hmm_path, mp_path, mode, with_offsets)) {}

std::shared_ptr<TensorOperation> JiebaTokenizer::Parse() {
  std::shared_ptr<JiebaTokenizerOperation> jieba_tokenizer =
    std::make_shared<JiebaTokenizerOperation>(data_->hmm_path_, data_->mp_path_, data_->mode_, data_->with_offsets_);
  for (auto &word : data_->words_list_) {
    Status rc = jieba_tokenizer->AddWord(word.first, word.second);
    if (rc.IsError()) {
      MS_LOG(ERROR) << rc;
      return {};
    }
  }
  return jieba_tokenizer;
}
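
// Usage sketch (illustrative; the model file names are assumptions): Jieba needs
// paths to its HMM and MP dictionary files, and custom words can be registered
// before the pipeline runs.
//   auto jieba = std::make_shared<text::JiebaTokenizer>("hmm_model.utf8", "jieba.dict.utf8", JiebaMode::kMp);
//   Status rc = jieba->AddWord("新词", 10);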

Status JiebaTokenizer::AddWordChar(const std::vector<char> &word, int64_t freq) {
  if (word.empty()) {
    std::string err_msg = "JiebaTokenizer : The parameter word is empty or not provided.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  if (freq < 0) {
    std::string err_msg = "JiebaTokenizer : The parameter freq must be greater than or equal to 0.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }
  (void)data_->words_list_.emplace_back(CharToString(word), freq);
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict) {
  for (auto &word_freq_pair : user_dict) {
    RETURN_IF_NOT_OK(AddWordChar(word_freq_pair.first, word_freq_pair.second));
  }
  return Status::OK();
}

Status JiebaTokenizer::AddDictChar(const std::vector<char> &file_path) {
  std::vector<std::pair<std::string, int64_t>> user_dict;
  RETURN_IF_NOT_OK(ParserFile(CharToString(file_path), &user_dict));
  RETURN_IF_NOT_OK(AddDictChar(PairStringInt64ToPairCharInt64(user_dict)));
  return Status::OK();
}

Status JiebaTokenizer::ParserFile(const std::string &file_path,
                                  std::vector<std::pair<std::string, int64_t>> *const user_dict) {
  auto realpath = FileUtils::GetRealPath(file_path.data());
  if (!realpath.has_value()) {
    MS_LOG(ERROR) << "Get real path failed, path=" << file_path;
    RETURN_STATUS_SYNTAX_ERROR("Get real path failed, path=" + file_path);
  }

  std::ifstream ifs(realpath.value());
  if (!ifs) {
    std::string err_msg = "JiebaTokenizer : Failed to load the dictionary from the input file, check the file path.";
    MS_LOG(ERROR) << err_msg;
    RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  // Each valid row is "<word>" or "<word> <frequency>"; rows that match neither form are skipped.
  const std::regex regex("^\\s*([^\\s*]+?)\\s*([0-9]+)?\\s*$");
  std::string line;
  while (std::getline(ifs, line)) {
    if (line.empty()) {
      continue;
    }
    std::smatch tokens;
    if (!std::regex_match(line, tokens, regex)) {
      continue;
    }
    if (tokens.size() == size_two) {
      (void)user_dict->emplace_back(tokens.str(value_one), 0);
    } else if (tokens.size() == size_three) {
      (void)user_dict->emplace_back(tokens.str(value_one), strtoll(tokens.str(value_two).c_str(), nullptr, 0));
    }
  }
  MS_LOG(INFO) << "JiebaTokenizer::AddDict: The size of the user input dictionary is: " << user_dict->size();
  MS_LOG(INFO) << "Valid rows in the input dictionary (at most the first " << kMaxLoggedRows << " rows are shown):";
  for (std::size_t i = 0; i != user_dict->size() && i < kMaxLoggedRows; ++i) {
    MS_LOG(INFO) << user_dict->at(i).first << " " << user_dict->at(i).second;
  }
  return Status::OK();
}
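
// Dictionary file format accepted by ParserFile, one entry per row; the words and
// frequencies below are made-up examples. A row may omit the frequency, in which
// case it defaults to 0.
//   天安门 3
//   北京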

// Lookup
struct Lookup::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
       mindspore::DataType data_type)
      : vocab_(vocab), data_type_(dataset::MSTypeToDEType(static_cast<TypeId>(data_type))) {
    if (unknown_token == std::nullopt) {
      unknown_token_ = std::nullopt;
    } else {
      unknown_token_ = std::string(unknown_token->begin(), unknown_token->end());
    }
  }
  std::shared_ptr<Vocab> vocab_;
  std::optional<std::string> unknown_token_;
  dataset::DataType data_type_;
};

Lookup::Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
               mindspore::DataType data_type)
    : data_(std::make_shared<Data>(vocab, unknown_token, data_type)) {}

std::shared_ptr<TensorOperation> Lookup::Parse() {
  return std::make_shared<LookupOperation>(data_->vocab_, data_->unknown_token_, data_->data_type_);
}
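
// Usage sketch (illustrative; `vocab` is an assumed, previously built Vocab):
// map tokens to ids, sending out-of-vocabulary tokens to "<unk>".
//   auto lookup = std::make_shared<text::Lookup>(vocab, std::string("<unk>"));
//   ds = ds->Map({lookup}, {"text"});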

// Ngram
struct Ngram::Data {
  Data(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
       const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
      : ngrams_(ngrams),
        left_pad_(PairCharToString(left_pad)),
        right_pad_(PairCharToString(right_pad)),
        separator_(CharToString(separator)) {}
  std::vector<int32_t> ngrams_;
  std::pair<std::string, int32_t> left_pad_;
  std::pair<std::string, int32_t> right_pad_;
  std::string separator_;
};

Ngram::Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
             const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator)
    : data_(std::make_shared<Data>(ngrams, left_pad, right_pad, separator)) {}

std::shared_ptr<TensorOperation> Ngram::Parse() {
  return std::make_shared<NgramOperation>(data_->ngrams_, data_->left_pad_, data_->right_pad_, data_->separator_);
}
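
// Usage sketch (illustrative; pad tokens and widths are assumptions): produce
// bigrams and trigrams from a token column, padding with one "<s>"/"</s>" token
// on each side and joining grams with a space.
//   auto ngram = std::make_shared<text::Ngram>(std::vector<int32_t>{2, 3},
//                                              std::make_pair("<s>", 1), std::make_pair("</s>", 1), " ");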

#ifndef _WIN32
// NormalizeUTF8
struct NormalizeUTF8::Data {
  explicit Data(NormalizeForm normalize_form) : normalize_form_(normalize_form) {}
  NormalizeForm normalize_form_;
};

NormalizeUTF8::NormalizeUTF8(NormalizeForm normalize_form) : data_(std::make_shared<Data>(normalize_form)) {}

std::shared_ptr<TensorOperation> NormalizeUTF8::Parse() {
  return std::make_shared<NormalizeUTF8Operation>(data_->normalize_form_);
}

// RegexReplace
struct RegexReplace::Data {
  Data(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
      : pattern_(CharToString(pattern)), replace_(CharToString(replace)), replace_all_(replace_all) {}
  std::string pattern_;
  std::string replace_;
  bool replace_all_;
};

RegexReplace::RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all)
    : data_(std::make_shared<Data>(pattern, replace, replace_all)) {}

std::shared_ptr<TensorOperation> RegexReplace::Parse() {
  return std::make_shared<RegexReplaceOperation>(data_->pattern_, data_->replace_, data_->replace_all_);
}
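
// Usage sketch (illustrative): collapse runs of whitespace into a single space,
// replacing every occurrence rather than just the first.
//   auto squeeze_ws = std::make_shared<text::RegexReplace>("\\s+", " ", /*replace_all=*/true);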

// RegexTokenizer
struct RegexTokenizer::Data {
  Data(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, bool with_offsets)
      : delim_pattern_(CharToString(delim_pattern)),
        keep_delim_pattern_(CharToString(keep_delim_pattern)),
        with_offsets_(with_offsets) {}
  std::string delim_pattern_;
  std::string keep_delim_pattern_;
  bool with_offsets_;
};

RegexTokenizer::RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
                               bool with_offsets)
    : data_(std::make_shared<Data>(delim_pattern, keep_delim_pattern, with_offsets)) {}

std::shared_ptr<TensorOperation> RegexTokenizer::Parse() {
  return std::make_shared<RegexTokenizerOperation>(data_->delim_pattern_, data_->keep_delim_pattern_,
                                                   data_->with_offsets_);
}
#endif

// SentencePieceTokenizer
struct SentencePieceTokenizer::Data {
  Data(const std::shared_ptr<SentencePieceVocab> &vocab, SPieceTokenizerOutType out_type)
      : vocab_(vocab), vocab_path_(""), out_type_(out_type) {}
  Data(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
      : vocab_(nullptr), vocab_path_(CharToString(vocab_path)), out_type_(out_type) {}
  std::shared_ptr<SentencePieceVocab> vocab_;
  std::string vocab_path_;
  SPieceTokenizerOutType out_type_;
};

SentencePieceTokenizer::SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                                               SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab, out_type)) {}

SentencePieceTokenizer::SentencePieceTokenizer(const std::vector<char> &vocab_path, SPieceTokenizerOutType out_type)
    : data_(std::make_shared<Data>(vocab_path, out_type)) {}

std::shared_ptr<TensorOperation> SentencePieceTokenizer::Parse() {
  if (data_->vocab_ != nullptr) {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_, data_->out_type_);
  } else {
    return std::make_shared<SentencePieceTokenizerOperation>(data_->vocab_path_, data_->out_type_);
  }
}
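
// Usage sketch (illustrative; the model file path is an assumption): the tokenizer
// accepts either an in-memory SentencePieceVocab or a trained model file on disk.
//   auto sp_tok = std::make_shared<text::SentencePieceTokenizer>("sp.model", SPieceTokenizerOutType::kString);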

// SlidingWindow
struct SlidingWindow::Data {
  Data(const int32_t width, const int32_t axis) : width_(width), axis_(axis) {}
  int32_t width_;
  int32_t axis_;
};

SlidingWindow::SlidingWindow(const int32_t width, const int32_t axis) : data_(std::make_shared<Data>(width, axis)) {}

std::shared_ptr<TensorOperation> SlidingWindow::Parse() {
  return std::make_shared<SlidingWindowOperation>(data_->width_, data_->axis_);
}

// ToNumber
struct ToNumber::Data {
  dataset::DataType data_type_;
};

ToNumber::ToNumber(mindspore::DataType data_type) : data_(std::make_shared<Data>()) {
  data_->data_type_ = dataset::MSTypeToDEType(static_cast<TypeId>(data_type));
}

std::shared_ptr<TensorOperation> ToNumber::Parse() { return std::make_shared<ToNumberOperation>(data_->data_type_); }
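
// Usage sketch (illustrative): convert a string column holding numerals to int32.
//   auto to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt32);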

// TruncateSequencePair
struct TruncateSequencePair::Data {
  explicit Data(int32_t max_length) : max_length_(max_length) {}
  int32_t max_length_;
};

TruncateSequencePair::TruncateSequencePair(int32_t max_length) : data_(std::make_shared<Data>(max_length)) {}

std::shared_ptr<TensorOperation> TruncateSequencePair::Parse() {
  return std::make_shared<TruncateSequencePairOperation>(data_->max_length_);
}

// UnicodeCharTokenizer
struct UnicodeCharTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

UnicodeCharTokenizer::UnicodeCharTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeCharTokenizer::Parse() {
  return std::make_shared<UnicodeCharTokenizerOperation>(data_->with_offsets_);
}

// WordpieceTokenizer
struct WordpieceTokenizer::Data {
  Data(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, int32_t max_bytes_per_token,
       const std::vector<char> &unknown_token, bool with_offsets)
      : vocab_(vocab),
        suffix_indicator_(CharToString(suffix_indicator)),
        max_bytes_per_token_(max_bytes_per_token),
        unknown_token_(CharToString(unknown_token)),
        with_offsets_(with_offsets) {}
  std::shared_ptr<Vocab> vocab_;
  std::string suffix_indicator_;
  int32_t max_bytes_per_token_;
  std::string unknown_token_;
  bool with_offsets_;
};

WordpieceTokenizer::WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
                                       int32_t max_bytes_per_token, const std::vector<char> &unknown_token,
                                       bool with_offsets)
    : data_(std::make_shared<Data>(vocab, suffix_indicator, max_bytes_per_token, unknown_token, with_offsets)) {}

std::shared_ptr<TensorOperation> WordpieceTokenizer::Parse() {
  return std::make_shared<WordpieceTokenizerOperation>(
    data_->vocab_, data_->suffix_indicator_, data_->max_bytes_per_token_, data_->unknown_token_, data_->with_offsets_);
}
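
// Usage sketch (illustrative; `vocab` is an assumed Vocab, and the argument values
// are assumptions): sub-word pieces that continue a word carry the suffix
// indicator, e.g. "##ing".
//   auto wp_tok = std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]");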

#ifndef _WIN32
// UnicodeScriptTokenizer
struct UnicodeScriptTokenizer::Data {
  Data(bool keep_whitespace, bool with_offsets) : keep_whitespace_(keep_whitespace), with_offsets_(with_offsets) {}
  bool keep_whitespace_;
  bool with_offsets_;
};

UnicodeScriptTokenizer::UnicodeScriptTokenizer(bool keep_whitespace, bool with_offsets)
    : data_(std::make_shared<Data>(keep_whitespace, with_offsets)) {}

std::shared_ptr<TensorOperation> UnicodeScriptTokenizer::Parse() {
  return std::make_shared<UnicodeScriptTokenizerOperation>(data_->keep_whitespace_, data_->with_offsets_);
}

// WhitespaceTokenizer
struct WhitespaceTokenizer::Data {
  explicit Data(bool with_offsets) : with_offsets_(with_offsets) {}
  bool with_offsets_;
};

WhitespaceTokenizer::WhitespaceTokenizer(bool with_offsets) : data_(std::make_shared<Data>(with_offsets)) {}

std::shared_ptr<TensorOperation> WhitespaceTokenizer::Parse() {
  return std::make_shared<WhitespaceTokenizerOperation>(data_->with_offsets_);
}
#endif
}  // namespace text
}  // namespace dataset
}  // namespace mindspore