/**
 * Copyright 2020-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
#undef HAVE_STDDEF_H
#undef HAVE_STDLIB_H
#endif

#include "securec.h"
#ifndef ENABLE_ANDROID
#include "proto/example.pb.h"
#endif
#ifdef ENABLE_PYTHON
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
#endif

#include "minddata/dataset/core/data_type.h"
#include "minddata/dataset/core/de_tensor.h"
#include "minddata/dataset/core/tensor_helpers.h"
#include "minddata/dataset/core/tensor_shape.h"
#include "minddata/dataset/include/dataset/constants.h"
#include "minddata/dataset/util/log_adapter.h"
#include "minddata/dataset/util/status.h"
#include "utils/ms_utils.h"

#ifdef ENABLE_PYTHON
namespace py = pybind11;
#endif

namespace mindspore::dataset {
class Tensor;
template <typename T>
class Allocator;

using offset_t = uint32_t;  // type of offset values to store strings locations
using TensorPtr = std::shared_ptr<Tensor>;

/// const of the size of the offset variable
constexpr uint8_t kOffsetSize = sizeof(offset_t);

class
DATASET_API Tensor {
 public:
  // Tensors are neither default-constructible nor copyable; construct them
  // via the Create* factory functions (copies go through CreateFromTensor).
  Tensor() = delete;
  Tensor(const Tensor &other) = delete;
  Tensor &operator=(const Tensor &other) = delete;

  /// Create a tensor using shape and type. This constructor should not be used directly, use CreateFromTensor instead.
  /// \note The shape and type information should be known and valid
  /// \note The constructor does not allocate data
  /// \param shape TensorShape
  /// \param type DataType
  Tensor(TensorShape shape, DataType type);

  /// Move constructor
  /// \param other Tensor to be moved
  Tensor(Tensor &&other) noexcept;

  /// Move assignment operator
  /// \param other Tensor to be moved
  Tensor &operator=(Tensor &&other) noexcept;

  /// Create a numeric tensor with type and shape. Items of the tensor would be uninitialized.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateEmpty(const TensorShape &shape, const DataType &type, TensorPtr *out);

  /// Create a numeric tensor from a pointer in memory. Length of the source data is determined from the shape and type.
  /// Data will be copied into the new created tensor.
  /// \param[in] shape shape of the output tensor
  /// \param[in] type type of the output tensor
  /// \param[in] src pointer to the source data
  /// \param[out] out Generated tensor
  /// \return Status code
  static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, TensorPtr *out);

  /// Create a tensor from a pointer in memory and length. Data will be copied into the new created tensor.
101 /// \param[in] shape shape of the output tensor 102 /// \param[in] type type of the output tensor 103 /// \param[in] src pointer to the source data 104 /// \param[in] length length of the src data 105 /// \param[out] out Generated tensor 106 /// \return Status code 107 static Status CreateFromMemory(const TensorShape &shape, const DataType &type, const uchar *src, 108 const dsize_t &length, TensorPtr *out); 109 110 /// Create a copy of the input tensor 111 /// \param[in] in original tensor to be copied 112 /// \param[out] out output tensor to be generated 113 /// \return Status CreateFromTensor(const TensorPtr & in,TensorPtr * out)114 static Status CreateFromTensor(const TensorPtr &in, TensorPtr *out) { 115 return CreateFromMemory(in->shape(), in->type(), in->GetBuffer(), in->SizeInBytes(), out); 116 } 117 118 /// Create a copy of the input tensor 119 /// \param[in] in MSTensor to create DETensor from. 120 /// \param[in] out DETensor created. 121 /// \return Status 122 static Status CreateFromMSTensor(const MSTensor &in, TensorPtr *out); 123 124 #ifdef ENABLE_PYTHON 125 /// Create a Tensor from a given py::array and reuse the memory of numpy 126 /// \param[in] arr py::array 127 /// \param[out] out Created tensor 128 /// \return Status Code 129 static Status CreateFromNpArray(py::array arr, TensorPtr *out); 130 131 /// Helper function to create a tensor from a Python dictionary object 132 /// \param[in] obj pybind11 wrapper for Python dictionary object 133 /// \param[out] out Created Tensor 134 /// \return Status 135 static Status CreateFromPythonObject(py::object obj, TensorPtr *out); 136 #endif 137 138 #ifndef ENABLE_ANDROID 139 /// Create a tensor of type DE_STRING from a BytesList. 
140 /// \param[in] bytes_list protobuf's Bytelist 141 /// \param[in] shape shape of the output tensor 142 /// \param[out] out created Tensor 143 /// \return Status Code 144 static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, TensorPtr *out); 145 146 /// Create a tensor of type UINT8 or INT8 from a BytesList. 147 /// The tensor will be padded with ' ' to reach the required pad_size. 148 /// \param[in] bytes_list protobuf's Bytelist 149 /// \param[in] shape shape of the output tensor 150 /// \param[in] type type of created tensor. Should be DE_UINT8 or INT8 151 /// \param[in] pad_size The size of the tensor after padding 152 /// \param[out] out created Tensor 153 /// \return Status Code 154 static Status CreateFromByteList(const dataengine::BytesList &bytes_list, const TensorShape &shape, 155 const DataType &type, dsize_t pad_size, TensorPtr *out); 156 #endif 157 158 /// Create a Tensor from a given list of values. 159 /// \param[in] items elements of the tensor 160 /// \param[in] shape shape of the output tensor 161 /// \param[out] out output argument to hold the created Tensor 162 /// \return Status Code 163 template <typename T> CreateFromVector(const std::vector<T> & items,const TensorShape & shape,TensorPtr * out)164 static Status CreateFromVector(const std::vector<T> &items, const TensorShape &shape, TensorPtr *out) { 165 CHECK_FAIL_RETURN_UNEXPECTED( 166 static_cast<dsize_t>(items.size()) == shape.NumOfElements(), 167 "Number of elements in the vector does not match the number of elements of the shape required"); 168 const DataType type = DataType::FromCType<T>(); 169 // if items is empty, items_ptr would be nullptr. CreateFromMemory will handle this case. 170 const auto items_ptr = reinterpret_cast<const uchar *>(&items[0]); 171 return CreateFromMemory(shape, type, items_ptr, out); 172 } 173 174 /// Create a 1D Tensor from a given list of values. 
175 /// \param[in] items elements of the tensor 176 /// \param[out] out output argument to hold the created Tensor 177 /// \return Status Code 178 template <typename T> CreateFromVector(const std::vector<T> & items,TensorPtr * out)179 static Status CreateFromVector(const std::vector<T> &items, TensorPtr *out) { 180 return CreateFromVector(items, TensorShape({static_cast<dsize_t>(items.size())}), out); 181 } 182 183 /// Create a 1D boolean Tensor from a given list of boolean values. 184 /// \param[in] items elements of the tensor 185 /// \param[in] shape shape of the output tensor 186 /// \param[out] out output argument to hold the created Tensor 187 /// \return Status Code CreateFromVector(const std::vector<bool> & items,const TensorShape & shape,TensorPtr * out)188 static Status CreateFromVector(const std::vector<bool> &items, const TensorShape &shape, TensorPtr *out) { 189 const std::vector<uint8_t> temp(items.begin(), items.end()); 190 RETURN_IF_NOT_OK(CreateFromVector(temp, shape, out)); 191 (*out)->type_ = DataType(DataType::DE_BOOL); 192 return Status::OK(); 193 } 194 195 /// Create a Tensor from a given list of strings. 196 /// @note: The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. 197 /// The offset array will store one extra value to find the length of the last string. 
198 /// OFFSET_1, OFFSET_2, ..., OFFSET_n+1, STRING_1, STRING_2, ..., STRING_n 199 /// The value of each offset is the start index of the corresponding string 200 /// Offsets is of type offset_t 201 /// strings will ne null-terminated 202 /// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) 203 /// |------------------------------------------------------------------------| 204 /// | OFFSET ARRAY | STRINGS | 205 /// | bytes 0-3 | bytes 4-7 | bytes 8-11 | bytes 12-15 | bytes 16-18 | 206 /// | 12 | 16 | 19 | abc\0 | de\0 | 207 /// |------------------------------------------------------------------------| 208 /// | first offset | second offset | end offset | first value | second value | 209 /// |------------------------------------------------------------------------| 210 /// \param[in] items elements of the tensor 211 /// \param[in] shape shape of the output tensor 212 /// \param[in] type data type of the output tensor, can only be DE_STRING or DE_BYTES 213 /// \param[out] out output argument to hold the created Tensor 214 /// \return Status Code CreateFromVector(const std::vector<std::string> & items,const TensorShape & shape,const DataType & type,TensorPtr * out)215 static Status CreateFromVector(const std::vector<std::string> &items, const TensorShape &shape, const DataType &type, 216 TensorPtr *out) { 217 RETURN_UNEXPECTED_IF_NULL(out); 218 CHECK_FAIL_RETURN_UNEXPECTED(static_cast<dsize_t>(items.size()) == shape.NumOfElements(), 219 "The number of elements in the vector: " + std::to_string(items.size()) + 220 " does not match the number of elements: " + std::to_string(shape.NumOfElements()) + 221 " the shape required."); 222 CHECK_FAIL_RETURN_UNEXPECTED(type.IsString(), "Can not create a numeric Tensor from a string vector."); 223 *out = std::make_shared<Tensor>(TensorShape({static_cast<dsize_t>(items.size())}), type); 224 CHECK_FAIL_RETURN_UNEXPECTED(out != nullptr, "Allocate memory failed."); 225 if (items.empty()) { 226 if (shape.known()) { 227 return 
(*out)->Reshape(shape); 228 } 229 } 230 auto length_sum = [](size_t sum, const std::string &s) { return s.length() + sum; }; 231 const dsize_t total_length = std::accumulate(items.begin(), items.end(), 0, length_sum); 232 233 // total bytes needed = offset array + strings 234 // offset array needs to store one offset var per element + 1 extra to get the length of the last string. 235 // strings will be null-terminated --> need 1 extra byte per element 236 const size_t num_bytes = (kOffsetSize + 1) * (*out)->shape_.NumOfElements() + kOffsetSize + total_length; 237 238 RETURN_IF_NOT_OK((*out)->AllocateBuffer(num_bytes)); 239 auto offset_arr = reinterpret_cast<offset_t *>((*out)->data_); 240 const uchar *buf = (*out)->GetStringsBuffer(); 241 242 offset_t offset = buf - (*out)->data_; // the first string will start here 243 uint32_t i = 0; 244 for (const auto &str : items) { 245 // insert the start index of the string. 246 offset_arr[i++] = offset; 247 // insert actual string 248 const int ret_code = 249 memcpy_s((*out)->data_ + offset, num_bytes - offset, common::SafeCStr(str), str.length() + 1); 250 if (ret_code != 0) { 251 MS_LOG(ERROR) << "Cannot copy string into Tensor"; 252 } 253 // next string will be stored right after the current one. 254 offset = offset + str.length() + 1; 255 } 256 // store one more offset value so we can get the length of the last string 257 offset_arr[i] = offset; 258 259 (*out)->data_end_ = (*out)->data_ + offset_arr[i]; 260 261 MS_ASSERT(num_bytes - offset == 0); 262 if (shape.known()) { 263 RETURN_IF_NOT_OK((*out)->Reshape(shape)); 264 } 265 return Status::OK(); 266 } 267 268 // Create a string Tensor from a string vector by default. 
CreateFromVector(const std::vector<std::string> & items,const TensorShape & shape,TensorPtr * out)269 static Status CreateFromVector(const std::vector<std::string> &items, const TensorShape &shape, TensorPtr *out) { 270 return CreateFromVector(items, shape, DataType(DataType::DE_STRING), out); 271 } 272 273 /// Create a numeric scalar Tensor from the given value. 274 /// \tparam T type of value 275 /// \param[in] item value 276 /// \param[out] out Created tensor 277 /// \return Status code 278 template <typename T> CreateScalar(const T & item,TensorPtr * out)279 static Status CreateScalar(const T &item, TensorPtr *out) { 280 const DataType type = DataType::FromCType<T>(); 281 const auto item_ptr = reinterpret_cast<const uchar *>(&item); 282 return CreateFromMemory(TensorShape::CreateScalar(), type, item_ptr, out); 283 } 284 285 /// Create a tensor from a binary file on disk. 286 /// \param[in] path file to be read 287 /// \param[out] out Created Tensor 288 /// \return Status code 289 static Status CreateFromFile(const std::string &path, TensorPtr *out); 290 291 /// Destruct the tensor and release the memory using the allocator 292 virtual ~Tensor(); 293 294 /// Equality operator. 
  /// compares tensor shape, type and data
  /// \param[in] rhs Tensor to be compared with
  /// \return bool
  bool operator==(const Tensor &rhs) const;

  /// Inequality operator, defined as the negation of operator==.
  /// \param[in] rhs Tensor to be compared with
  /// \return bool
  bool operator!=(const Tensor &rhs) const { return !((*this) == rhs); }

  /// Serialize this tensor into a JSON object.
  /// \param[out] out_json destination JSON object
  /// \return Status code
  Status to_json(nlohmann::json *out_json);

  /// Helper for to_json: serializes data interpreted as element type T.
  template <typename T>
  Status to_json_convert(nlohmann::json *args);

  /// Deserialize a tensor from a JSON object.
  /// \param[in] op_params JSON holding the tensor description
  /// \param[out] tensor created tensor
  /// \return Status code
  static Status from_json(nlohmann::json op_params, std::shared_ptr<Tensor> *tensor);

  /// Helper for from_json: builds a tensor with element type T from JSON data.
  template <typename T>
  static Status from_json_convert(const nlohmann::json &json_data, const TensorShape &shape,
                                  std::shared_ptr<Tensor> *tensor);

  /// Helper for from_json: builds a tensor of the given DataType from JSON data.
  static Status from_json_convert(const nlohmann::json &json_data, const TensorShape &shape, const DataType &type,
                                  std::shared_ptr<Tensor> *tensor);

  /// Get item located at `index`, caller needs to provide the type.
  /// \tparam T
  /// \param[in] index vector<dsize_t>
  /// \return return the item specified at index
  template <typename T>
  Status GetItemAt(T *o, const std::vector<dsize_t> &index) const;

  /// Get string located at `index`.
323 /// \param[in] index vector<dsize_t> 324 /// \return return std::string_view specified at index 325 Status GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) const; 326 327 template <typename T> 328 Status GetUnsignedIntAt(T *o, const std::vector<dsize_t> &index) const; 329 330 template <typename T> 331 Status GetSignedIntAt(T *o, const std::vector<dsize_t> &index) const; 332 333 template <typename T> 334 Status GetFloatAt(T *o, const std::vector<dsize_t> &index) const; 335 336 /// set item at location specified by index 337 /// \param[in] index 338 /// \param[in] value of type `T` 339 template <typename T> SetItemAt(const std::vector<dsize_t> & index,const T & value)340 Status SetItemAt(const std::vector<dsize_t> &index, const T &value) { 341 T *ptr = nullptr; 342 RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index)); 343 *ptr = value; 344 return Status::OK(); 345 } 346 347 /// set string item at location specified by index 348 /// \param[in] index 349 /// \param[in] value of type std::string SetItemAt(const std::vector<dsize_t> & index,const std::string & value)350 Status SetItemAt(const std::vector<dsize_t> &index, const std::string &value) { 351 RETURN_UNEXPECTED_IF_NULL(data_); 352 uchar *ptr = nullptr; 353 offset_t length = 0; 354 RETURN_IF_NOT_OK(GetItemPtr(&ptr, index, &length)); 355 if (value.length() != length) { 356 RETURN_STATUS_UNEXPECTED("Length of the new string does not match the item."); 357 } 358 const int ret_code = memcpy_s(reinterpret_cast<char *>(ptr), length, value.c_str(), length); 359 CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to set data into tensor."); 360 361 return Status::OK(); 362 } 363 364 /// Fill tensor with zeros. Does not support string or bytes. 
Zero()365 Status Zero() { 366 CHECK_FAIL_RETURN_UNEXPECTED(!type_.IsString(), "Can not fill zeros on tensor of type string or bytes."); 367 dsize_t size = SizeInBytes(); 368 CHECK_FAIL_RETURN_UNEXPECTED(memset_sp(GetMutableBuffer(), size, 0, size) == 0, 369 "Failed to fill tensor with zeroes."); 370 return Status::OK(); 371 } 372 373 /// Fill all elements in the Tensor with the given value of type `T`. Does not support string or bytes. 374 /// \tparam T 375 /// \param value[in] 376 template <typename T> Fill(const T & value)377 Status Fill(const T &value) { 378 CHECK_FAIL_RETURN_UNEXPECTED(!type_.IsString(), "Can not fill on tensor of type string or bytes."); 379 const int64_t cellSize = type_.SizeInBytes(); 380 if ((data_ != nullptr) && type_.IsCompatible<T>()) { 381 for (dsize_t i = 0; i < Size(); i++) { 382 CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s((data_ + i * cellSize), cellSize, &value, cellSize) == 0, "memcpy err"); 383 } 384 return Status::OK(); 385 } else { 386 std::string err; 387 err += (data_ == nullptr) ? "data_ is nullptr \t" : ""; 388 err += type_.IsCompatible<T>() ? "data type not compatible\t" : ""; 389 return {StatusCode::kMDUnexpectedError, err}; 390 } 391 } 392 393 /// Getter function for shape 394 /// \return shape()395 const TensorShape &shape() const { return shape_; } 396 397 /// Check if tensor has data 398 /// \return bool - true if tensor is not empty HasData()399 bool HasData() const { return data_ != nullptr; } 400 401 /// Check if tensor is complex 402 /// \return bool - true if tensor is complex IsComplex()403 bool IsComplex() const { 404 if (shape_.empty()) { 405 return false; 406 } 407 // check the last dim all be 2 408 return shape_[-1] == 2; 409 } 410 411 /// Reshape the tensor. 
  /// The given shape should have the same number of elements in the Tensor
  /// \param shape
  virtual Status Reshape(const TensorShape &shape);

  /// \return number of elements in this tensor
  dsize_t Size() const { return shape().NumOfElements(); }

  /// \return the number of bytes this tensor is needs
  dsize_t SizeInBytes() const {
    // If the end pointer was recorded, use it; otherwise derive the size
    // from the element type size and the element count.
    if (data_end_ == nullptr) {
      return type_.SizeInBytes() * shape_.NumOfElements();
    }
    return data_end_ - data_;
  }

  /// Get the exact length of string / bytes
  Status GetStringLength(uint32_t *length) const {
    CHECK_FAIL_RETURN_UNEXPECTED(type().IsString(), "Only support to get the length of string or bytes Tensor.");
    // Total bytes minus the offset array ((Size() + 1) offsets) and one
    // null terminator per element leaves the raw string payload length.
    *length = data_end_ - data_ - (Size() + 1) * kOffsetSize - Size();
    return Status::OK();
  }

  /// \return the rank of the tensor
  dsize_t Rank() const { return shape().Rank(); }

  /// Get the starting memory address as a constant for the data of the tensor. This potentially
  /// drives an allocation if the data area.
  /// \return const unsigned char*
  const uchar *GetBuffer() const { return data_; }

  /// Getter of the type
  /// \return
  DataType type() const { return type_; }

  /// Provide stream operator for displaying the Tensor.
  /// \param out Output stream.
  /// \param tensor Tensor object to be printed.
  /// \return Output stream.
  friend std::ostream &operator<<(std::ostream &out, const Tensor &tensor) {
    tensor.Print(out);
    return out;
  }

  /// Invalidate this Tensor by setting the type and shape to unknown and MData to null.
  /// Calling this method will make the Tensor and its data inaccessible, use it with caution.
  void Invalidate();

  /// Copy input tensor into self at the location index.
  /// Index is a vector of axes which can be incomplete:
  /// Ex: shape <2,3>, inserting into index {0} will replace the first row. index {1,2} will replace the last cell.
  /// \param index
  /// \param input
  /// \param partial_insert: boolean to determine if insertion along the full axis is enforced
  /// \return Status code
  Status InsertTensor(const std::vector<dsize_t> &index, const std::shared_ptr<Tensor> &input,
                      bool partial_insert = false);

  /// Find the address of the given index. Used in InsertTensor.
  /// Example:
  /// Tensor t= [[1,2],[3,4]] , StartAddrOfIndex({0}) -> &1
  /// \param[in] ind Element index.
  /// \param[out] start_addr_of_index Starting address of the element index.
  /// \param[out] remaining Remaining shape from the index.
  /// \return Status code.
  Status StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_index, TensorShape *remaining);

  /// Expand the shape of the Tensor with one extra dimension.
  /// For example, if the shape is <512,512,3>:
  /// *- ExpandDim(0) gives: <1,512,512,3>
  /// *- ExpandDim(1) gives: <512,1,512,3>
  /// *- ExpandDim(3) gives: <512,512,3,1>
  /// \param axis location of the dim
  virtual Status ExpandDim(const dsize_t &axis);

  /// Squeeze the tensor's shape — presumably drops size-1 dimensions;
  /// exact behavior is defined in the implementation file (TODO confirm).
  virtual void Squeeze();

  /// Calculates the strides of the Tensor
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT8 (1 byte)
  /// The strides will be {6,2,1}.
  /// Ex: Tensor of shape <4,2,2> and type DE_UINT32 (4 byte)
  /// The strides will be {24,8,4}.
  /// \return vector of integers
  std::vector<dsize_t> Strides() const;

  /// Render the tensor (info and data) into a string via Print.
  std::string ToString() const {
    std::stringstream ss;
    this->Print(ss);
    return ss.str();
  }

  /// Handle negative indices.
  /// \param[in] index Index to be handled.
  /// \param[in] length Axis length of this index.
  /// \return Handled index.
HandleNeg(dsize_t index,dsize_t length)505 static inline dsize_t HandleNeg(dsize_t index, dsize_t length) { return (index < 0) ? (index + length) : index; } 506 507 /// Handle negative indices. 508 /// \param[in] index_vector Vector of indices. 509 /// \param[in] length Length of each axis. 510 /// \return Modified vector of indices. HandleNegIndices(const std::vector<dsize_t> & index_vector,const std::vector<dsize_t> & length)511 static inline std::vector<dsize_t> HandleNegIndices(const std::vector<dsize_t> &index_vector, 512 const std::vector<dsize_t> &length) { 513 if (length.size() < index_vector.size()) { 514 MS_LOG(ERROR) << "The size of length should be greater than the shape of index_vector"; 515 return {}; 516 } 517 std::vector<dsize_t> indices(index_vector.size(), 0); 518 for (size_t i = 0; i < index_vector.size(); i++) { 519 indices[i] = HandleNeg(index_vector[i], length[i]); 520 } 521 return indices; 522 } 523 524 /// Slice tensor bases on the given indices. Copy the sliced data into out tensor. 525 /// Based on the type of tensor, SliceNumeric or SliceString will be called 526 /// \param[out] out Tensor 527 /// \param[in] slice_options vector of SliceOption objects 528 /// \return Status error code 529 Status Slice(TensorPtr *out, const std::vector<mindspore::dataset::SliceOption> &slice_options); 530 531 /// Get slice_option according to shape and index. 
  /// \param[in] slice_option input SliceOption object
  /// \param[in] slice_index index of SliceOption object
  /// \param[out] output slice_option with shape info
  /// \return Status error code
  Status GetSliceOption(const SliceOption &slice_option, const int32_t &slice_index, SliceOption *slice_option_ptr);

#ifdef ENABLE_PYTHON
  /// Constructs numpy array from input tensor
  /// \param[out] data this data is the location of python data
  /// \return Status code
  Status GetDataAsNumpy(py::array *data);

  /// Constructs numpy array of string or bytes
  /// \param[out] data this data is the location of python data
  /// \return Status code
  Status GetDataAsNumpyStrings(py::array *data);

  /// Constructs a numpy array from a string tensor, converting each element to T.
  /// \tparam T destination element type (elements are cast via std::string)
  /// \param[out] data destination numpy array, reshaped to this tensor's shape
  /// \return Status code
  template <typename T>
  Status GetDataAsNumpyStrings(py::array *data) {
    RETURN_UNEXPECTED_IF_NULL(data);
    if (Size() == 0) {
      // NumPy will create empty array in type of float64 by default. So we must define the data type.
      *data = py::array(type_.AsNumpyType(), shape_.AsVector(), nullptr);
    } else {
      std::vector<T> string_vector;
      string_vector.reserve(Size());
      // Iterate over tensor and create a vector of string_views of strings in the tensor.
      (void)std::transform(begin<std::string_view>(), end<std::string_view>(), std::back_inserter(string_vector),
                           [](const auto &element) { return static_cast<std::string>(element); });
      *data = py::array(py::cast(string_vector));
      data->resize(shape_.AsVector());
    }
    return Status::OK();
  }

  /// Fill a py::buffer_info describing tensor `t` for the Python buffer protocol.
  static Status GetBufferInfo(Tensor *t, py::buffer_info *out);

  /// Returns the Python dictionary stored in the tensor
  /// \param[out] data this data is the location of Python data (pybind11 wrapper)
  /// \return Status code
  Status GetDataAsPythonObject(py::dict *data);

#endif

  /// Record YUV image geometry (width/height and their strides) for this tensor.
  /// NOTE(review): values are stored as-is; no validation is performed here.
  Status SetYuvShape(const uint32_t &width, const uint32_t &widthStride, const uint32_t &height,
                     const uint32_t &heightStride) {
    const std::vector<uint32_t> tmp{width, widthStride, height, heightStride};
    yuv_shape_ = tmp;
    return Status::OK();
  }

  /// \return copy of the YUV shape recorded by SetYuvShape (empty if never set).
  std::vector<uint32_t> GetYuvShape() { return yuv_shape_; }

  /// TensorIterator is a linear iterator that can be used to iterate over the elements of the Tensor
  /// The order elements is as the memory layout (i.e., row-major) [[1,2,3],[4,5,6] --> 1,2,3,4,5,6
  /// \tparam T type of values in the Tensor Iterator
  template <typename T, bool = true>
  class TensorIterator {
   public:
    using iterator_category = std::random_access_iterator_tag;
    using value_type = T;
    using difference_type = ptrdiff_t;
    using pointer = T *;
    using reference = T &;

    // Wraps a raw byte pointer and reinterprets it as a pointer to T.
    explicit TensorIterator(uchar *ptr = nullptr) : ptr_(reinterpret_cast<T *>(ptr)) {}

    TensorIterator(const TensorIterator<T> &raw_iterator) { ptr_ = raw_iterator.ptr_; }

    ~TensorIterator() = default;

    TensorIterator<T> &operator=(const TensorIterator<T> &rhs) {
      // Guard against self-assignment.
      if (this == &rhs) {
        return *this;
      }
      ptr_ = rhs.ptr_;
      return *this;
    }

    // Rebind the iterator directly to a typed pointer.
    TensorIterator<T> &operator=(T *rhs) {
      ptr_ = rhs;
      return *this;
    }

    bool operator==(const TensorIterator<T> &rhs) const { return ptr_ == rhs.ptr_; }

    bool operator!=(const TensorIterator<T> &rhs) const { return !(*this == rhs); }

    // True when the iterator points at something.
    operator bool() const { return ptr_ != nullptr; }

    T &operator*() { return *ptr_; }

    const T &operator*() const { return *ptr_; }

    T *operator->() { return ptr_; }

    TensorIterator<T> &operator+=(const ptrdiff_t &inc) {
      ptr_ += inc;
      return *this;
    }

    TensorIterator<T> &operator-=(const ptrdiff_t &inc) {
      ptr_ -= inc;
      return *this;
    }

    TensorIterator<T> &operator++() {
      ++ptr_;
      return *this;
    }

    TensorIterator<T> &operator--() {
      --ptr_;
      return *this;
    }

    TensorIterator<T> operator++(int) {
      auto temp(*this);
      ++ptr_;
      return temp;
    }

    TensorIterator<T> operator--(int) {
      auto temp(*this);
      --ptr_;
      return temp;
    }

    TensorIterator<T> operator+(const ptrdiff_t &inc) {
      auto temp(*this);
      temp.ptr_ += inc;
      return temp;
    }

    TensorIterator<T> operator-(const ptrdiff_t &inc) {
      auto temp(*this);
      temp.ptr_ -= inc;
      return temp;
    }

   protected:
    // Current position inside the tensor's data buffer.
    T *ptr_;
  };

  // Specialization of TensorIterator for strings. It returns std::string_view for every item.
677 // \tparam DUMMY, used to mbe able to specialize the inner class 678 template <bool DUMMY> 679 class TensorIterator<std::string_view, DUMMY> { 680 public: 681 using iterator_category = std::random_access_iterator_tag; 682 using value_type = std::string_view; 683 using difference_type = ptrdiff_t; 684 using pointer = std::string_view *; 685 using reference = std::string_view &; 686 687 explicit TensorIterator(const uchar *data = nullptr, dsize_t index = 0) { 688 data_ = reinterpret_cast<const char *>(data); 689 index_ = index; 690 } 691 TensorIterator(const TensorIterator<std::string_view,DUMMY> & raw_iterator)692 TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) { 693 data_ = raw_iterator.data_; 694 index_ = raw_iterator.index_; 695 } 696 697 ~TensorIterator() = default; 698 699 bool operator==(const TensorIterator<std::string_view> &rhs) const { 700 return data_ == rhs.data_ && index_ == rhs.index_; 701 } 702 703 bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); } 704 705 operator bool() const { return data_ != nullptr; } 706 707 std::string_view operator*() const { 708 const auto offset_ = reinterpret_cast<const offset_t *>(data_); 709 const offset_t start = offset_[index_]; 710 const offset_t end = offset_[index_ + 1]; 711 return std::string_view{data_ + start, end - start - 1}; // -1 to skip the \0 at the end 712 } 713 714 TensorIterator<std::string_view> &operator+=(const dsize_t &inc) { 715 index_ += inc; 716 return *this; 717 } 718 719 TensorIterator<std::string_view> &operator-=(const dsize_t &inc) { 720 index_ -= inc; 721 return *this; 722 } 723 724 TensorIterator<std::string_view> &operator++() { 725 ++index_; 726 return *this; 727 } 728 729 TensorIterator<std::string_view> &operator--() { 730 --index_; 731 return *this; 732 } 733 734 TensorIterator<std::string_view> operator++(int) { 735 auto temp(*this); 736 ++index_; 737 return temp; 738 } 739 740 TensorIterator<std::string_view> 
operator--(int) { 741 auto temp(*this); 742 --index_; 743 return temp; 744 } 745 746 TensorIterator<std::string_view> operator+(const dsize_t &inc) { 747 auto temp(*this); 748 temp.index_ += inc; 749 return temp; 750 } 751 752 TensorIterator<std::string_view> operator-(const dsize_t &inc) { 753 auto temp(*this); 754 temp.index_ -= inc; 755 return temp; 756 } 757 758 protected: 759 dsize_t index_; 760 const char *data_; 761 }; 762 763 /// Return a TensorIterator that points to the start of the Tensor. 764 /// It's the user responsibility to use the correct type that matches the Tensor type 765 /// \tparam T The type of values in the Tensor 766 /// \return TensorIterator 767 template <typename T> begin()768 TensorIterator<T> begin() { 769 return TensorIterator<T>(data_); 770 } 771 772 /// Return a linear iterator that points to the place after the last element of the Tensor. 773 /// \tparam T The type of values in the Tensor 774 /// \return TensorIterator 775 template <typename T> end()776 TensorIterator<T> end() { 777 return TensorIterator<T>(data_end_); 778 } 779 780 /// Copies the last dimension at `index` from Tensor `src` to this Tensor. 781 /// \param[in] src Tensor 782 /// \param[in] index vector to the start of the dimension. The last dim should be 0 783 /// \return Status 784 Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index); 785 786 /// Get the starting memory address for the data of the tensor. This potentially 787 /// drives an allocation if the data is null. 788 /// \return unsigned char* GetMutableBuffer()789 uchar *GetMutableBuffer() { return data_; } 790 791 /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if 792 /// the tensor's type is a string, otherwise undefined address would be returned. 793 /// \return return the address of the first string of the tensor. 
  /// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if
  /// the tensor's type is a string, otherwise undefined address would be returned.
  /// The buffer holds (NumOfElements() + 1) offsets of kOffsetSize bytes each before the string bytes,
  /// hence the trailing `+ kOffsetSize` to skip the extra terminating offset.
  /// \return return the address of the first string of the tensor.
  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }

 protected:
  /// Allocate memory for the tensor using the data_allocator
  /// \param[in] length number of bytes to be allocated
  /// \return Error Status
  Status AllocateBuffer(const dsize_t &length);

  /// A function that prints Tensor recursively, first called by print
  /// \param[in] out output stream to print to
  /// \param[in] cur_dim dimension currently being printed
  /// \param[in] cur_index per-dimension index accumulated so far by the recursion
  void PrintRecursive(std::ostream &out, int32_t cur_dim, const std::vector<dsize_t> &cur_index) const;

  /// Print the info and data of tensor.
  /// \param[out] out Output stream.
  void Print(std::ostream &out) const;

  /// Print the data of tensor.
  /// \param[out] out Output stream.
  void PrintData(std::ostream &out) const;

  /// A function that print the value as specified by its index
  /// \param[in] index vector representing the index
  /// \param[out] out output stream to print to
  void PrintItemAt(const std::vector<dsize_t> &index, std::ostream &out) const;

  /// Get pointer to item located at `index`, caller needs to provide the type.
  /// \tparam T expected element type; must match the tensor's actual type
  /// \param[out] ptr receives the address of the item
  /// \param[in] index vector<dsize_t>
  /// \return return a pointer to the item specified at index of type `T`
  template <typename T>
  Status GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const;

  /// Get pointer to string located at `index` and the length of string
  /// \param[out] ptr receives the address of the string
  /// \param[in] index vector<dsize_t>
  /// \param[out] length optional; receives the length of the string when non-null
  /// \return return a pointer to the string specified at index and the length of the string
  Status GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset_t *length = nullptr) const;

  /// Given a flat index of an item string, return the start and length of the item.
  /// \param[in] index Flat index of the item.
  /// \param[out] string_start Starting address of the string.
  /// \param[out] length Length of the string.
  /// \return Status code.
  Status GetStringAt(dsize_t index, uchar **string_start, offset_t *length) const;

  /// Lazily-constructed allocator shared by all tensors; the function-local static is
  /// built on first use from the global memory pool.
  static const std::unique_ptr<Allocator<unsigned char>> &GetAllocator() {
    static auto allocator = std::make_unique<Allocator<unsigned char>>(GlobalContext::Instance()->mem_pool());
    return allocator;
  }

  /// all access to shape_ should be via shape
  TensorShape shape_;
  /// data type of tensor
  DataType type_;
  /// pointer to the start of the physical data
  unsigned char *data_;
  /// pointer to the end of the physical data
  unsigned char *data_end_ = nullptr;

  /// shape for interpretation of YUV image
  std::vector<uint32_t> yuv_shape_;

#ifdef ENABLE_PYTHON
  /// Store python dictionary wrapper
  py::object python_dict_;
  /// String form of python_dict_ (presumably a cached serialization — confirm against the .cc implementation)
  std::string python_dict_as_str_;

  /// Hold the np.ndarray which is from python layer without memcpy cost
  py::buffer python_array_;
#endif

 private:
  friend class DETensor;

  /// Slice numeric tensors.
  Status SliceNumeric(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape);

  /// Slice string tensors
  Status SliceString(TensorPtr *out, const std::vector<std::vector<dsize_t>> &indices, const TensorShape &shape);

  /// Copy raw data of an array based on shape and strides to the destination pointer
  /// \param dst [out] Pointer to the destination array where the content is to be copied
  /// \param[in] src Pointer to the source of strided array to be copied
  /// \param[in] shape shape of the source array
  /// \param[in] strides strides of the source array
  /// \param[in] type_size number of bytes needed to store one array element's type
  /// \return Status Code
  static Status CopyStridedArray(unsigned char *dst, unsigned char *src, std::vector<dsize_t> shape,
                                 std::vector<dsize_t> strides, uint8_t type_size);

#ifdef ENABLE_PYTHON
  /// Helper function to create a tensor from Numpy array of strings
  /// \param[in] arr Numpy array
  /// \param[out] out Created Tensor
  /// \return Status
  static Status CreateFromNpString(py::array arr, TensorPtr *out);
#endif
};

/// end() specialization for string tensors: the string iterator walks an offset table by
/// element index rather than by raw byte address, so the past-the-end iterator is
/// (data_, NumOfElements()) instead of the data_end_ pointer used by the generic end().
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
  return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
}

/// Create a string scalar Tensor from the given value.
/// \param[in] item value
/// \param[out] out Created tensor
/// \return Status code
template <>
inline Status Tensor::CreateScalar<std::string>(const std::string &item, TensorPtr *out) {
  RETURN_UNEXPECTED_IF_NULL(out);
  return CreateFromVector({item}, TensorShape::CreateScalar(), DataType(DataType::DE_STRING), out);
}
}  // namespace mindspore::dataset
#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_CORE_TENSOR_H_