1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_ 17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_ 18 19 #include <deque> 20 #include <memory> 21 #include <string> 22 #include <vector> 23 #include "./securec.h" 24 #ifndef ENABLE_ANDROID 25 #include "utils/log_adapter.h" 26 #else 27 #include "mindspore/lite/src/common/log_adapter.h" 28 #endif 29 #if defined(_WIN32) || defined(_WIN64) 30 #undef HAVE_STDDEF_H 31 #undef HAVE_STDLIB_H 32 #endif 33 34 #ifdef ENABLE_PYTHON 35 #include "pybind11/numpy.h" 36 #include "pybind11/pybind11.h" 37 #include "pybind11/stl.h" 38 #endif 39 40 #include "minddata/dataset/include/dataset/constants.h" 41 #include "minddata/dataset/core/data_type.h" 42 #include "minddata/dataset/core/tensor_helpers.h" 43 #include "minddata/dataset/core/tensor_shape.h" 44 #include "minddata/dataset/core/de_tensor.h" 45 #include "minddata/dataset/util/status.h" 46 #include "utils/ms_utils.h" 47 #ifndef ENABLE_ANDROID 48 #include "proto/example.pb.h" 49 #endif 50 51 #ifdef ENABLE_PYTHON 52 namespace py = pybind11; 53 #endif 54 namespace mindspore { 55 namespace dataset { 56 class Tensor; 57 template <typename T> 58 class Allocator; 59 60 using CharAllocPtr = std::unique_ptr<Allocator<unsigned char>>; 61 using TensorAllocPtr = std::shared_ptr<Allocator<Tensor>>; // An allocator shared_ptr for Tensors 62 using offset_t = uint32_t; // type of offset values to store strings locations 63 using TensorPtr = std::shared_ptr<Tensor>; 64 65 class Tensor { 66 public: 67 Tensor() = delete; 68 Tensor(const Tensor &other) = delete; 69 Tensor &operator=(const Tensor &other) = delete; 70 71 /// Create a tensor using shape and type. This constructor should not be used directly, use CreateFromTensor instead. 72 /// \note The shape and type information should be known and valid 73 /// \note The constructor does not allocate data 74 /// \param shape TensorShape 75 /// \param type DataType 76 Tensor(const TensorShape &shape, const DataType &type); 77 78 /// Move constructor 79 /// \param other Tensor to be moved 80 Tensor(Tensor &&other) noexcept; 81 82 /// Move assignment operator 83 /// \param other Tensor to be moved 84 Tensor &operator=(Tensor &&other) noexcept; 85 86 /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized. 87 /// \param[in] shape shape of the output tensor 88 /// \param[in] type type of the output tensor 89 /// \param[out] out Generated tensor 90 /// \return Status code 91 static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out); 92 93 /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and type. 94 /// Data will be copied into the new created tensor. 95 /// \param[in] shape shape of the output tensor 96 /// \param[in] type type of the output tensor 97 /// \param[in] src pointer to the source data 98 /// \param[out] out Generated tensor 99 /// \return Status code 100 static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out); 101 102 /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor. 103 /// \param[in] shape shape of the output tensor 104 /// \param[in] type type of the output tensor 105 /// \param[in] src pointer to the source data 106 /// \param[in] length length of the src data 107 /// \param[out] out Generated tensor 108 /// \return Status code 109 static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, 110 const dsize_t &length, TensorPtr *out); 111 112 /// Create a copy of the input tensor 113 /// \param[in] in original tensor to be copied 114 /// \param[out] out output tensor to be generated 115 /// \return Status CreateFromTensor(const TensorPtr & in,TensorPtr * out)116 static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) { 117 return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out); 118 } 119 120 /// Create a copy of the input tensor 121 /// \param[in] MSTensor to create DETensorFrom 122 /// \return Status 123 static Status CreateFromMSTensor(const MSTensor &in, TensorPtr *out); 124 125 #ifdef ENABLE_PYTHON 126 /// Create a Tensor from a given py::array 127 /// \param[in] arr py::array 128 /// \param[out] out Created tensor 129 /// \return Status Code 130 static Status CreateFromNpArray(const py::array &arr, TensorPtr *out); 131 #endif 132 133 #ifndef ENABLE_ANDROID 134 /// Create a tensor of type DE_STRING from a BytesList. 135 /// \param[in] bytes_list protobuf's Bytelist 136 /// \param[in] shape shape of the output tensor 137 /// \param[out] out created Tensor 138 /// \return Status Code 139 static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out); 140 141 /// Create a tensor of type UINT8 or INT8 from a BytesList. 142 /// The tensor will be padded with ' ' to reach the required pad_size. 143 /// \param[in] bytes_list protobuf's Bytelist 144 /// \param[in] shape shape of the output tensor 145 /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8 146 /// \param[in] pad_size The size of the tensor after padding 147 /// \param[out] out created Tensor 148 /// \return Status Code 149 static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, 150 const DataType &type, dsize_t pad_size, TensorPtr *out); 151 #endif 152 153 /// Create a Tensor from a given list of values. 154 /// \tparam type of the values to be inserted. 155 /// \param[in] items elements of the tensor 156 /// \param[in] shape shape of the output tensor 157 /// \param[out] out output argument to hold the created Tensor 158 /// \return Status Code 159 template <typename T> CreateFromVector(const std::vector<T> & items,const TensorShape & shape,TensorPtr * out)160 static Status CreateFromVector(const std::vector<T> &items, const TensorShape &shape, TensorPtr *out) { 161 CHECK_FAIL_RETURN_UNEXPECTED( 162 static_cast<dsize_t>(items.size()) == shape.NumOfElements(), 163 "Number of elements in the vector does not match the number of elements of the shape required"); 164 DataType type = DataType::FromCType<T>(); 165 // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case. 166 auto items_ptr = reinterpret_cast<const uchar *>(&items[0]); 167 return CreateFromMemory(shape, type, items_ptr, out); 168 } 169 170 /// Create a 1D Tensor from a given list of values. 171 /// \tparam type of the values to be inserted. 172 /// \param[in] items elements of the tensor 173 /// \param[out] out output argument to hold the created Tensor 174 /// \return Status Code 175 template <typename T> CreateFromVector(const std::vector<T> & items,TensorPtr * out)176 static Status CreateFromVector(const std::vector<T> &items, TensorPtr *out) { 177 return CreateFromVector(items, TensorShape({static_cast<dsize_t>(items.size())}), out); 178 } 179 180 /// Create a 1D boolean Tensor from a given list of boolean values. 181 /// \param[in] items elements of the tensor 182 /// \param[in] shape shape of the output tensor 183 /// \param[out] out output argument to hold the created Tensor 184 /// \return Status Code CreateFromVector(const std::vector<bool> & items,const TensorShape & shape,TensorPtr * out)185 static Status CreateFromVector(const std::vector<bool> &items, const TensorShape &shape, TensorPtr *out) { 186 std::vector<uint8_t> temp(items.begin(), items.end()); 187 RETURN_IF_NOT_OK(CreateFromVector(temp, shape, out)); 188 (*out)->type_ = DataType(DataType::DE_BOOL); 189 return Status::OK(); 190 } 191 192 /// Create a numeric scalar Tensor from the given value. 193 /// \tparam T type of value 194 /// \param[in] item value 195 /// \param[out] out Created tensor 196 /// \return Status code 197 template <typename T> CreateScalar(const T & item,TensorPtr * out)198 static Status CreateScalar(const T &item, TensorPtr *out) { 199 DataType type = DataType::FromCType<T>(); 200 auto item_ptr = reinterpret_cast<const uchar *>(&item); 201 return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out); 202 } 203 204 /// Create a tensor from a binary file on disk. 205 /// \param[in] path file to be read 206 /// \param[out] out Created Tensor 207 /// \return Status code 208 static Status CreateFromFile(const std::string &path, TensorPtr *out); 209 210 /// Destruct the tensor and release the memory using the allocator 211 virtual ~Tensor(); 212 213 /// Equality operator. compares tensor shape, type and data 214 /// \param[in] rhs Tensor to be compared with 215 /// \return bool 216 bool operator==(const Tensor &rhs) const; 217 218 bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); } 219 220 Status to_json(nlohmann::json *out_json); 221 222 template <typename T> 223 Status to_json_convert(nlohmann::json *args); 224 225 static Status from_json(nlohmann::json op_params, std::shared_ptr<Tensor> *tensor); 226 227 template <typename T> 228 static Status from_json_convert(nlohmann::json json_data, TensorShape shape, std::shared_ptr<Tensor> *tensor); 229 230 /// Get item located at `index`, caller needs to provide the type. 231 /// \tparam T 232 /// \param[in] index vector<dsize_t> 233 /// \return return the item specified at index 234 template <typename T> 235 Status GetItemAt(T *o, const std::vector<dsize_t> &index) const; 236 237 /// Get string located at `index`. 238 /// \param[in] index vector<dsize_t> 239 /// \return return std::string_view specified at index 240 Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const; 241 242 template <typename T> 243 Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const; 244 245 template <typename T> 246 Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const; 247 248 template <typename T> 249 Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const; 250 251 /// set item at location specified by index 252 /// \tparam `T` 253 /// \param[in] index 254 /// \param[in] value of type `T` 255 template <typename T> SetItemAt(const std::vector<dsize_t> & index,const T & value)256 Status SetItemAt(const std::vector<dsize_t> &index, const T &value) { 257 T *ptr = nullptr; 258 RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index)); 259 *ptr = value; 260 return Status::OK(); 261 } 262 263 /// set string item at location specified by index 264 /// \param[in] index 265 /// \param[in] value of type std::string SetItemAt(const std::vector<dsize_t> & index,const std::string & value)266 Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) { 267 RETURN_UNEXPECTED_IF_NULL(data_); 268 uchar *ptr = nullptr; 269 offset_t length = 0; 270 RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length)); 271 if (value.length() != length) { 272 RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item."); 273 } 274 int ret_code = memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length); 275 CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to set data into tensor."); 276 277 return Status::OK(); 278 } 279 280 /// fill tensor with Zeros. Does not support strings. Zero()281 Status Zero() { 282 CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use Zero on tensor of strings.."); 283 dsize_t size = SizeInBytes(); 284 CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0, 285 "Failed to fill tensor with zeroes."); 286 return Status::OK(); 287 } 288 289 /// Fill all elements in the Tensor with the given value of type `T`. Does not support strings. 290 /// \tparam T 291 /// \param value[in] 292 template <typename T> Fill(const T & value)293 Status Fill(const T &value) { 294 CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); 295 int64_t cellSize = type_.SizeInBytes(); 296 if ((data_ != nullptr) && type_.IsCompatible<T>()) { 297 for (dsize_t i = 0; i < Size(); i++) { 298 CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err"); 299 } 300 return Status::OK(); 301 } else { 302 std::string err; 303 err += (data_ == nullptr) ? "data_ is nullptr \t" : ""; 304 err += type_.IsCompatible<T>() ? "data type not compatible\t" : ""; 305 return Status(StatusCode::kMDUnexpectedError, err); 306 } 307 } 308 309 /// Getter function for shape 310 /// \return shape()311 const TensorShape &shape() const { return shape_; } 312 313 /// Check if tensor has data 314 /// \return bool - true if tensor is not empty HasData()315 bool HasData() const { return data_ != nullptr; } 316 317 /// Check if tensor is complex 318 /// \return bool - true if tensor is complex IsComplex()319 bool IsComplex() const { 320 // check the last dim all be 2 321 return shape_[-1] == 2; 322 } 323 324 /// Reshape the tensor. The given shape should have the same number of elements in the Tensor 325 /// \param shape 326 virtual Status Reshape(const TensorShape &shape); 327 328 /// \return number of elements in this tensor Size()329 dsize_t Size() const { return shape().NumOfElements(); } 330 331 /// \return the number of bytes this tensor is needs SizeInBytes()332 dsize_t SizeInBytes() const { 333 if (data_end_ == nullptr) return type_.SizeInBytes() * shape_.NumOfElements(); 334 return data_end_ - data_; 335 } 336 337 /// \return the rank of the tensor Rank()338 dsize_t Rank() const { return shape().Rank(); } 339 340 /// Get the starting memory address as a constant for the data of the tensor. This potentially 341 /// drives an allocation if the data area. 342 /// \return const unsigned char* GetBuffer()343 const unsigned char *GetBuffer() const { return data_; } 344 345 /// Getter of the type 346 /// \return type()347 DataType type() const { return type_; } 348 349 /// Provide stream operator for displaying it 350 /// \param output stream 351 /// \param so the Tensor object to be printed 352 /// \return output stream 353 friend std::ostream &operator<<(std::ostream &out, const Tensor &so) { 354 so.Print(out); 355 return out; 356 } 357 358 /// Invalidate this Tensor by setting the type and shape to unknown and MData to null. 359 /// Calling this method will make the Tensor and its data inaccessible, use it with caution. 360 void Invalidate(); 361 362 /// Copy input tensor into self at the location index. 363 /// Index is a vector of axes which can be incomplete: 364 /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell. 365 /// \param index 366 /// \param input 367 /// \param partial_insert: boolean to determine if insertion along the full axis is enforced 368 /// \return Status code 369 Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input, 370 const bool partial_insert = false); 371 372 /// Find the address of the given index. Used in InsertTensor. 373 /// Example: 374 /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1 375 /// \param index incomplete index 376 /// \param output: startAddrofIndex 377 /// \param output: remaining 378 /// \return Status code 379 Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining); 380 381 /// Expand the shape of the Tensor with one extra dimension. 382 /// For example, if the shape is <512,512,3>: 383 /// *- ExpandDim(0) gives: <1,512,512,3> 384 /// *- ExpandDim(1) gives: <512,1,512,3> 385 /// *- ExpandDim(3) gives: <512,512,3,1> 386 /// \param axis location of the dim 387 virtual Status ExpandDim(const dsize_t &axis); 388 389 virtual void Squeeze(); 390 391 /// Calculates the strides of the Tensor 392 /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte) 393 /// The strides will be {6,2,1}. 394 /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte) 395 /// The strides will be {24,8,4}. 396 /// \return vector of integers 397 std::vector<dsize_t> Strides() const; 398 ToString()399 std::string ToString() { 400 std::stringstream ss; 401 this->Print(ss); 402 return ss.str(); 403 } 404 405 /// Handle negative indices. 406 /// \param[out] out modified index 407 /// \param[in] index 408 /// \param[in] length axis length used to modify index 409 /// \return dsize_t modified index HandleNeg(dsize_t index,dsize_t length)410 static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; } 411 412 /// Handle negative indices for a vector of indices. 413 /// \param[out] out modified vector of indices 414 /// \param[in] index_vector vector of indices 415 /// \return std::vector<dsize_t> modified vector of indices HandleNegIndices(std::vector<dsize_t> index_vector,std::vector<dsize_t> length)416 static inline std::vector<dsize_t> HandleNegIndices(std::vector<dsize_t> index_vector, std::vector<dsize_t> length) { 417 if (length.size() < index_vector.size()) { 418 MS_LOG(ERROR) << "The size of length should be greater than the shape of index_vector"; 419 return {}; 420 } 421 std::vector<dsize_t> indices(index_vector.size(), 0); 422 for (size_t i = 0; i < index_vector.size(); i++) { 423 indices[i] = HandleNeg(index_vector[i], length[i]); 424 } 425 return indices; 426 } 427 428 /// Slice tensor bases on the given indices. Copy the sliced data into out tensor. 429 /// Based on the type of tensor, SliceNumeric or SliceString will be called 430 /// \param[out] out Tensor 431 /// \param[in] slice_options vector of SliceOption objects 432 /// \return Status error code 433 Status Slice(TensorPtr *out, const std::vector<mindspore::dataset::SliceOption> slice_options); 434 435 /// Get slice_option according to shape and index. 436 /// \param[in] slice_option input SliceOption object 437 /// \param[in] slice_index index of SliceOption object 438 /// \param[out] output slice_option with shape info 439 /// \return Status error code 440 Status GetSliceOption(const SliceOption &slice_option, const int32_t &slice_index, SliceOption *slice_option_ptr); 441 442 #ifdef ENABLE_PYTHON 443 /// Constructs numpy array from input tensor 444 /// \param[in] data this data is the location of python data 445 /// \return Status code 446 Status GetDataAsNumpy(py::array *data); 447 448 Status GetDataAsNumpyStrings(py::array *data); 449 450 static Status GetBufferInfo(Tensor *t, py::buffer_info *out); 451 #endif 452 SetYuvShape(const uint32_t & width,const uint32_t & widthStride,const uint32_t & height,const uint32_t & heightStride)453 Status SetYuvShape(const uint32_t &width, const uint32_t &widthStride, const uint32_t &height, 454 const uint32_t &heightStride) { 455 std::vector<uint32_t> tmp{width, widthStride, height, heightStride}; 456 yuv_shape_ = tmp; 457 return Status::OK(); 458 } 459 GetYuvShape()460 std::vector<uint32_t> GetYuvShape() { return yuv_shape_; } 461 462 /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor 463 /// The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6 464 /// \tparam T type of values in the Tensor Iterator 465 template <typename T, bool = true> 466 class TensorIterator { 467 public: 468 using iterator_category = std::random_access_iterator_tag; 469 using value_type = T; 470 using difference_type = ptrdiff_t; 471 using pointer = T *; 472 using reference = T &; 473 474 explicit TensorIterator(uchar *ptr = nullptr) { ptr_ = reinterpret_cast<T *>(ptr); } 475 TensorIterator(const TensorIterator<T> & raw_iterator)476 TensorIterator(const TensorIterator<T> &raw_iterator) { ptr_ = raw_iterator.ptr_; } 477 478 ~TensorIterator() = default; 479 480 TensorIterator<T> &operator=(const TensorIterator<T> &rhs) { 481 ptr_ = rhs.ptr_; 482 return *this; 483 } 484 485 TensorIterator<T> &operator=(T *rhs) { 486 ptr_ = rhs; 487 return *this; 488 } 489 490 bool operator==(const TensorIterator<T> &rhs) { return ptr_ == rhs.ptr_; } 491 492 bool operator!=(const TensorIterator<T> &rhs) { return !(*this == rhs); } 493 494 operator bool() const { return ptr_ != nullptr; } 495 496 T &operator*() { return *ptr_; } 497 498 const T &operator*() const { return *ptr_; } 499 500 T *operator->() { return ptr_; } 501 502 TensorIterator<T> &operator+=(const ptrdiff_t &inc) { 503 ptr_ += inc; 504 return *this; 505 } 506 507 TensorIterator<T> &operator-=(const ptrdiff_t &inc) { 508 ptr_ -= inc; 509 return *this; 510 } 511 512 TensorIterator<T> &operator++() { 513 ++ptr_; 514 return *this; 515 } 516 517 TensorIterator<T> &operator--() { 518 --ptr_; 519 return *this; 520 } 521 522 TensorIterator<T> operator++(int) { 523 auto temp(*this); 524 ++ptr_; 525 return temp; 526 } 527 528 TensorIterator<T> operator--(int) { 529 auto temp(*this); 530 --ptr_; 531 return temp; 532 } 533 534 TensorIterator<T> operator+(const ptrdiff_t &inc) { 535 auto oldPtr = ptr_; 536 ptr_ += inc; 537 auto temp(*this); 538 ptr_ = oldPtr; 539 return temp; 540 } 541 542 TensorIterator<T> operator-(const ptrdiff_t &inc) { 543 auto oldPtr = ptr_; 544 ptr_ -= inc; 545 auto temp(*this); 546 ptr_ = oldPtr; 547 return temp; 548 } 549 550 protected: 551 T *ptr_; 552 }; 553 554 // Specialization of TensorIterator for strings. It returns std::string_view for every item. 555 // \tparam DUMMY, used to mbe able to specialize the inner class 556 template <bool DUMMY> 557 class TensorIterator<std::string_view, DUMMY> { 558 public: 559 using iterator_category = std::random_access_iterator_tag; 560 using value_type = std::string_view; 561 using difference_type = ptrdiff_t; 562 using pointer = std::string_view *; 563 using reference = std::string_view &; 564 565 explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) { 566 data_ = reinterpret_cast<const char *>(data); 567 index_ = index; 568 } 569 TensorIterator(const TensorIterator<std::string_view,DUMMY> & raw_iterator)570 TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) { 571 data_ = raw_iterator.data_; 572 index_ = raw_iterator.index_; 573 } 574 575 ~TensorIterator() = default; 576 577 bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; } 578 579 bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); } 580 581 operator bool() const { return data_ != nullptr; } 582 583 std::string_view operator*() const { 584 auto offset_ = reinterpret_cast<const offset_t *>(data_); 585 offset_t start = offset_[index_]; 586 return std::string_view{data_ + start}; 587 } 588 589 TensorIterator<std::string_view> &operator+=(const dsize_t &inc) { 590 index_ += inc; 591 return *this; 592 } 593 594 TensorIterator<std::string_view> &operator-=(const dsize_t &inc) { 595 index_ -= inc; 596 return *this; 597 } 598 599 TensorIterator<std::string_view> &operator++() { 600 ++index_; 601 return *this; 602 } 603 604 TensorIterator<std::string_view> &operator--() { 605 --index_; 606 return *this; 607 } 608 609 TensorIterator<std::string_view> operator++(int) { 610 auto temp(*this); 611 ++index_; 612 return temp; 613 } 614 615 TensorIterator<std::string_view> operator--(int) { 616 auto temp(*this); 617 --index_; 618 return temp; 619 } 620 621 TensorIterator<std::string_view> operator+(const dsize_t &inc) { 622 auto oldPtr = index_; 623 index_ += inc; 624 auto temp(*this); 625 index_ = oldPtr; 626 return temp; 627 } 628 629 TensorIterator<std::string_view> operator-(const dsize_t &inc) { 630 auto oldPtr = index_; 631 index_ -= inc; 632 auto temp(*this); 633 index_ = oldPtr; 634 return temp; 635 } 636 637 protected: 638 dsize_t index_; 639 const char *data_; 640 }; 641 642 /// Return a TensorIterator that points to the start of the Tensor. 643 /// It's the user responsibility to use the correct type that matches the Tensor type 644 /// \tparam T The type of values in the Tensor 645 /// \return TensorIterator 646 template <typename T> begin()647 TensorIterator<T> begin() { 648 return TensorIterator<T>(data_); 649 } 650 651 /// Return a linear iterator that points to the place after the last element of the Tensor. 652 /// \tparam T The type of values in the Tensor 653 /// \return TensorIterator 654 template <typename T> end()655 TensorIterator<T> end() { 656 return TensorIterator<T>(data_end_); 657 } 658 659 /// Copies the last dimension at `index` from Tensor `src` to this Tensor. 660 /// \param[in] src Tensor 661 /// \param[in] index vector to the start of the dimension. The last dim should be 0 662 /// \return Status 663 Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index); 664 665 protected: 666 /// Allocate memory for the tensor using the data_allocator 667 /// \param[in] length number of bytes to be allocated 668 /// \return Error Status 669 Status AllocateBuffer(const dsize_t &length); 670 671 /// Get the starting memory address for the data of the tensor. This potentially 672 /// drives an allocation if the data is null. 673 /// \return unsigned char* GetMutableBuffer()674 unsigned char *GetMutableBuffer() { return data_; } 675 676 /// A function that prints Tensor recursively, first called by print 677 /// \param[in] out 678 /// \param[in] cur_dim 679 /// \param[in] cur_index 680 void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector<dsize_t> &cur_index) const; 681 682 /// A function that prints info about the tensor 683 /// \param[out] out output stream 684 void Print(std::ostream &out) const; 685 686 /// A function that prints info about the tensor 687 /// \param[out] out output stream 688 void PrintData(std::ostream &out) const; 689 690 /// A function that print the value as specified by its index 691 /// \param[in] index vector representing the index 692 /// \param[out] out 693 void PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const; 694 695 /// Get pointer to item located at `index`, caller needs to provide the type. 696 /// \tparam T 697 /// \param[in] index vector<dsize_t> 698 /// \return return a pointer to the item specified at index of type `T` 699 template <typename T> 700 Status GetItemPtr(T **, const std::vector<dsize_t> &index) const; 701 702 /// Get pointer to string located at `index` and the length of string 703 /// \param[in] index vector<dsize_t> 704 /// \return return a pointer to the string specified at index and the length of the string 705 Status GetItemPtr(uchar **, const std::vector<dsize_t> &index, offset_t *length = nullptr) const; 706 707 /// Given a flat index of an item string, return the start and length of the item 708 /// \param[in] index flat index of the item 709 /// \param[out] start address of the ths string 710 /// \param[out] length of the string 711 Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const; 712 713 /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if 714 /// the tensor's type is a string, otherwise undefined address would be returned. \return address of the first string 715 /// of the tensor. GetStringsBuffer()716 uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } 717 718 /// all access to shape_ should be via shape 719 TensorShape shape_; 720 /// data type of tensor 721 DataType type_; 722 /// pointer to the start of the physical data 723 unsigned char *data_; 724 /// An allocator for data_ 725 CharAllocPtr data_allocator_; 726 /// pointer to the end of the physical data 727 unsigned char *data_end_ = nullptr; 728 729 /// shape for interpretation of YUV image 730 std::vector<uint32_t> yuv_shape_; 731 732 private: 733 friend class DETensor; 734 735 /// Slice numeric tensors. 736 Status SliceNumeric(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape); 737 738 /// Slice string tensors 739 Status SliceString(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape); 740 741 /// Copy raw data of a array based on shape and strides to the destination pointer 742 /// \param dst [out] Pointer to the destination array where the content is to be copied 743 /// \param[in] src Pointer to the source of strided array to be copied 744 /// \param[in] shape shape of the source array 745 /// \param[in] strides strides of the source array 746 /// \param[in] type_size number of bytes needed to store one array element's type 747 /// \return Status Code 748 static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape, 749 std::vector<dsize_t> strides, uint8_t type_size); 750 751 /// const of the size of the offset variable 752 static constexpr uint8_t kOffsetSize = sizeof(offset_t); 753 754 #ifdef ENABLE_PYTHON 755 /// Helper function to create a tensor from Numpy array of strings 756 /// \param[in] arr Numpy array 757 /// \param[out] out Created Tensor 758 /// \return Status 759 static Status CreateFromNpString(py::array arr, TensorPtr *out); 760 #endif 761 }; 762 template <> 763 inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() { 764 return TensorIterator<std::string_view>(data_, shape_.NumOfElements()); 765 } 766 767 /// Create a Tensor from a given list of strings. 768 /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. 769 /// The offset array will store one extra value to find the length of the last string. 770 /// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n 771 /// The value of each offset is the start index of the corresponding string 772 /// Offsets is of type offset_t 773 /// strings will ne null-terminated 774 /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) 775 /// |----------------------------------------------------------------| 776 /// | OFFSET ARRAY | STRINGS | 777 /// | bytes 0-3 | bytes 3-6 | bytes 7-10 | bytes 11-14 | bytes 15-17 | 778 /// | 11 | 15 | 18 | abc\0 | de\0 | 779 /// |----------------------------------------------------------------| 780 /// \param[in] items elements of the tensor 781 /// \param[in] shape shape of the output tensor 782 /// \param[out] out output argument to hold the created Tensor 783 /// \return Status Code 784 template <> 785 inline Status Tensor::CreateFromVector<std::string>(const std::vector<std::string> &items, const TensorShape &shape, 786 TensorPtr *out) { 787 RETURN_UNEXPECTED_IF_NULL(out); 788 CHECK_FAIL_RETURN_UNEXPECTED( 789 static_cast<dsize_t>(items.size()) == shape.NumOfElements(), 790 "Number of elements in the vector does not match the number of elements of the shape required"); 791 const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); 792 *out = std::allocate_shared<Tensor>(*alloc, TensorShape({static_cast<dsize_t>(items.size())}), 793 DataType(DataType::DE_STRING)); 794 CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed."); 795 if (items.size() == 0) { 796 if (shape.known()) { 797 return (*out)->Reshape(shape); 798 } 799 } 800 auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; 801 dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum); 802 803 // total bytes needed = offset array + strings 804 // offset array needs to store one offset var per element + 1 extra to get the length of the last string. 805 // strings will be null-terminated --> need 1 extra byte per element 806 dsize_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length; 807 808 RETURN_IF_NOT_OK((*out)->AllocateBuffer(num_bytes)); 809 auto offset_arr = reinterpret_cast<offset_t *>((*out)->data_); 810 uchar *buf = (*out)->GetStringsBuffer(); 811 812 offset_t offset = buf - (*out)->data_; // the first string will start here 813 uint32_t i = 0; 814 for (const auto &str : items) { 815 // insert the start index of the string. 816 offset_arr[i++] = offset; 817 // total bytes are reduced by kOffsetSize 818 num_bytes -= kOffsetSize; 819 // insert actual string 820 int ret_code = memcpy_s((*out)->data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); 821 if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor"; 822 // next string will be stored right after the current one. 823 offset = offset + str.length() + 1; 824 // total bytes are reduced by the length of the string 825 num_bytes -= str.length() + 1; 826 } 827 // store one more offset value so we can get the length of the last string 828 offset_arr[i] = offset; 829 830 (*out)->data_end_ = (*out)->data_ + offset_arr[i]; 831 832 MS_ASSERT(num_bytes == 0); 833 if (shape.known()) { 834 RETURN_IF_NOT_OK((*out)->Reshape(shape)); 835 } 836 return Status::OK(); 837 } 838 /// Create a string scalar Tensor from the given value. 839 /// \param[in] item value 840 /// \param[out] out Created tensor 841 /// \return Status code 842 template <> 843 inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) { 844 RETURN_UNEXPECTED_IF_NULL(out); 845 return CreateFromVector<std::string>({item}, TensorShape::CreateScalar(), out); 846 } 847 } // namespace dataset 848 } // namespace mindspore 849 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_ 850