#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
// (include list elided)
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
// (per-operator ops includes elided)
#endif
// (include list elided)

namespace at::meta {
inline void cat_check_no_zero_dim(const MaterializedITensorListRef& tensors) {
  size_t i = 0;
  for (const Tensor& t : tensors) {
    TORCH_CHECK(
        t.dim() > 0,
        "zero-dimensional tensor (at position ", i, ") cannot be concatenated");
    i++;
  }
}

inline c10::MemoryFormat cat_compute_output_memory_format(const MaterializedITensorListRef& inputs) {
  std::optional<c10::MemoryFormat> format = std::nullopt;
  for (const Tensor& t : inputs) {
    auto f = t.suggest_memory_format();
    if (f == c10::MemoryFormat::Contiguous) {
      return f;
    }
    if (format.has_value() && format.value() != f) {
      return c10::MemoryFormat::Contiguous;
    }
    format = f;
  }
  return format.value();
}
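// Illustrative example (sketch) for the helper above: if every input suggests
// ChannelsLast, the concatenated output is ChannelsLast; as soon as any input
// suggests Contiguous, or two inputs disagree on a non-contiguous format, the
// result falls back to MemoryFormat::Contiguous.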
TORCH_PRECOMPUTE_META_FUNC(cat)(const ITensorListRef& tensors, int64_t dim) {
  // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
  // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
  // to be "skipped".  We maintain this behavior for backwards compatibility, but only for this specific
  // size (i.e. other empty sizes are not skipped).
  auto materialized = tensors.materialize();

  cat_check_no_zero_dim(materialized);
  dim = at::legacy_cat_wrap_dim(dim, materialized);

  // Checking names before the actual dimensions.
  auto maybe_outnames = namedinference::compute_cat_outnames(materialized);

  TORCH_CHECK(
      !materialized.empty(), "torch.cat(): expected a non-empty list of Tensors");

  // Look for the first valid tensor.
  size_t valid = materialized.size();
  for (const auto i : c10::irange(materialized.size())) {
    if (!at::native::cat_should_skip_tensor(materialized[i].get())) {
      valid = i;
      break;
    }
  }

  bool all_contiguous = true;
  bool all_same_dtype = true;
  bool all_same_sizes_and_stride = true;
  auto memory_format = cat_compute_output_memory_format(materialized);

  // Compute what the output dtype should be:
  const auto& result = maybe_get_output();
  auto is_out_defined = result.defined();
  auto out_dtype = at::native::result_type(tensors);

  // If the output tensor is defined, we need to take it into account
  // when computing the actual output dtype and the flags.
  if (is_out_defined) {
    // Check for type promotion, if the output tensor is defined.
    TORCH_CHECK(
        canCast(out_dtype, result.scalar_type()),
        "torch.cat(): input types can't be cast to the desired output type ",
        result.scalar_type());
    out_dtype = result.scalar_type();
    all_contiguous = result.is_contiguous(memory_format);
  }

  // Fallback 'set_output' parameters.
  // (in case we don't find a valid tensor)
  DimVector sizes {0};
  TensorOptions options = materialized[0].get().options()
      .dtype(out_dtype)
      .memory_format(memory_format);

  // If we found a valid tensor, check whether the input tensors
  // are compatible, i.e. we can execute `cat` on them.
  bool found_valid_tensor = valid < materialized.size();
  if (found_valid_tensor) {
    TORCH_CHECK(
        dim <= materialized[valid].get().dim(),
        "torch.cat(): dimension ", dim, " out of range");

    // Compute the output tensor size.
    // It should have the same shape as any other valid tensor,
    // except in the dimension 'dim'.
    size_t size_at_dim = 0;
    for (const auto i : c10::irange(materialized.size())) {
      const Tensor& t = materialized[i];
      all_same_dtype = all_same_dtype && out_dtype == t.scalar_type();
      if (!at::native::cat_should_skip_tensor(t)) {
        at::native::check_cat_shape_except_dim(materialized[valid], t, dim, i);
        size_at_dim += t.size(dim);
        all_contiguous = all_contiguous && t.is_contiguous(memory_format);
        all_same_sizes_and_stride = all_same_sizes_and_stride &&
            t.sizes() == materialized[valid].get().sizes() &&
            t.strides() == materialized[valid].get().strides();
      } else {
        all_contiguous = false;
      }
    }

    // Actually set the output.
    sizes = materialized[valid].get().sizes().vec();
    sizes[dim] = size_at_dim;
    options = materialized[valid].get().options()
        .dtype(out_dtype)
        .memory_format(memory_format);
  }

  set_output_raw_strided(0, sizes, {}, options, maybe_outnames);

  // Checks for overlaps between the inputs and the output tensor.
  if (is_out_defined && found_valid_tensor) {
    at::assert_no_internal_overlap(result);
    for (const Tensor& t : materialized) {
      at::assert_no_overlap(result, t);
    }
  }

  return TORCH_PRECOMPUTE_STRUCT(cat)()
      .set_dim(dim)
      .set_valid(valid)
      .set_all_contiguous(all_contiguous)
      .set_all_same_dtype(all_same_dtype)
      .set_all_same_sizes_and_stride(all_same_sizes_and_stride)
      .set_memory_format(memory_format);
}
} // namespace at::meta
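// Illustrative usage (sketch) of the meta function above, which handles shape
// checking and type promotion for cat:
//   auto a = at::ones({2, 3}, at::kFloat);
//   auto b = at::ones({4, 3}, at::kDouble);
//   auto c = at::cat({a, b}, 0);  // shape [6, 3], dtype double by promotion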
namespace at::native {

DEFINE_DISPATCH(cat_serial_stub);
DEFINE_DISPATCH(stack_serial_stub);

Tensor _reshape_from_tensor(const Tensor& self, const Tensor& shape_tensor) {
  TORCH_CHECK(shape_tensor.dim() == 1);
  std::vector<int64_t> shape;
  auto accessor = shape_tensor.accessor<int64_t, 1>();
  for (const auto i : c10::irange(shape_tensor.numel())) {
    shape.push_back(accessor[i]);
  }
  return self.reshape(IntArrayRef(shape));
}

Tensor _shape_as_tensor(const Tensor& self) {
  auto options = TensorOptions(at::kLong);
  return at::tensor(self.sizes(), options);
}

Tensor& set_(Tensor& result, Storage source) {
  int64_t new_size =
      static_cast<int64_t>(source.nbytes() / result.dtype().itemsize());
  return result.set_(std::move(source), 0, new_size, {});
}

// unify with cuda implementation? This is not done to avoid a dispatch in resize_impl_cpu_
Tensor& set_storage_cpu_(Tensor& result, Storage storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) {
  checkSetStorage(result, std::move(storage), storage_offset, size, stride);

  result.unsafeGetTensorImpl()->set_storage_offset(storage_offset);
  at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ?
                                          at::OptionalIntArrayRef(stride) : std::nullopt;
  // We can re-use this kernel for the meta device.
  // We just need to make sure we don't actually try to resize the (null) storage.
  at::native::resize_impl_cpu_(result.unsafeGetTensorImpl(), size, stride_opt, /*resize_storage=*/!result.is_meta());
  return result;
}
Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  checkSetStorage(result, storage, storage_offset, size, stride);

  c10::SymDimVector contiguous_strides;
  if (stride.data() == nullptr) {
    // TODO: dedupe this with empty() symbolic logic
    int64_t dim = size.size();
    contiguous_strides.resize(dim);
    if (dim > 0) {
      const auto last_idx = dim - 1;
      contiguous_strides.at(last_idx) = 1;
      for (auto i = last_idx - 1; i >= 0; --i) {
        // TODO: max with 1
        contiguous_strides.at(i) = contiguous_strides.at(i+1) * size.at(i+1);
      }
    }
    stride = contiguous_strides;
  }

  // Run this before storage setting so we can access numel
  result.unsafeGetTensorImpl()->set_sizes_and_strides(size, stride, storage_offset);

  // Matches maybe_resize_storage_cpu no-numel behavior
  if (TORCH_GUARD_SIZE_OBLIVIOUS(result.sym_numel().sym_ne(0))) {
    // maybe_resize_storage_cpu can handle no storage exists at all but
    // that should never be the case here
    TORCH_INTERNAL_ASSERT(storage);
    TORCH_CHECK(storage.resizable(), "Trying to resize storage that is not resizable");
    // All meta data pointers are the same, so we don't have to "re" allocate
    // it.  TODO: Actually this might not quite be correct if we use special
    // pointers to track whether or not fake cuda tensors are pinned or not
    const auto itemsize = result.dtype().itemsize();
    c10::SymInt new_size_bytes = result.is_contiguous() ?
      at::detail::computeStorageNbytesContiguous(size, itemsize, std::move(storage_offset)) :
      at::detail::computeStorageNbytes(size, stride, itemsize, std::move(storage_offset));
    // TODO: When there are unbacked SymInts, we unconditionally skip the
    // setter.  This is technically wrong, but we cannot conveniently test
    // the real condition in many cases, because a lot of people are using
    // set_ just to swizzle metadata on a tensor, they didn't actually want
    // to see if they need to resize the storage.
    //
    // The old behavior was to unconditionally set_nbytes, but I think not
    // setting it is more safe.
    if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() &&
        TORCH_GUARD_SIZE_OBLIVIOUS(new_size_bytes.sym_gt(storage.sym_nbytes()))) {
      storage.set_nbytes(std::move(new_size_bytes));
    }
  }
  return result;
}
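// Worked example (sketch) for the contiguous-stride fallback above: sizes
// [2, 3, 4] yield strides [12, 4, 1], since stride[i] = stride[i+1] * size[i+1]
// and the last stride is set to 1.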
Tensor& set__symint(Tensor& result, const Tensor& storage, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
  TORCH_CHECK(storage.is_contiguous(), "passed in tensor to be used as storage must be contiguous");
  return result.set__symint(storage.storage(), storage_offset + storage.sym_storage_offset(), size, stride);
}

Tensor& set_tensor_(Tensor& result, const Tensor& source) {
  if (result.unsafeGetTensorImpl() != source.unsafeGetTensorImpl()) {
    return result.set__symint(source.storage(), source.sym_storage_offset(), source.sym_sizes(), source.sym_strides());
  }
  return result;
}

// this needs to be split along CPU/CUDA lines because we don't have a consistent
// way of getting the allocator to use for a device (c10::GetAllocator is not
// the same as at::cuda::getCUDADeviceAllocator().
Tensor& set_cpu_(Tensor& result) {
  caffe2::TypeMeta dtype = result.dtype();
  Storage storage(
      Storage::use_byte_size_t(),
      0,
      c10::GetAllocator(kCPU),
      true);
  result.set_(std::move(storage), 0, {0}, {});
  TORCH_INTERNAL_ASSERT(dtype == result.dtype());
  return result;
}

// We can't re-use the cpu kernel here because we don't want to use the cpu allocator.
Tensor& set_meta_(Tensor& result) {
  caffe2::TypeMeta dtype = result.dtype();
  Storage storage(
      Storage::use_byte_size_t(),
      0,
      c10::GetAllocator(kMeta),
      true);
  result.set_(std::move(storage), 0, {0}, {});
  TORCH_INTERNAL_ASSERT(dtype == result.dtype());
  return result;
}

Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
  TORCH_CHECK(self.is_sparse(), "input must be sparse tensor");
  int64_t sparse_extra_ndim = size.size() - self.dim();
  int64_t sparse_ndim = size.size() - self.dense_dim();
  TORCH_CHECK(sparse_extra_ndim >= 0, "input not broadcastable to size with smaller dimensionality");
  Tensor indices = self._indices();
  Tensor values = self._values();
  auto nnz = values.size(0);

  std::vector<int64_t> broadcast_sizes;
  std::vector<int64_t> broadcast_dense_sizes;
  std::vector<int64_t> broadcast_dims;
  std::vector<int64_t> unchanged_dims;
  broadcast_sizes.reserve(sparse_ndim);
  broadcast_dense_sizes.reserve(self.dense_dim() + 1);
  broadcast_dims.reserve(self.sparse_dim());
  unchanged_dims.reserve(self.sparse_dim());
  int64_t nnz_factor = 1;
  int64_t min_broadcast_dim = (sparse_extra_ndim > 0 ? 0 : -1);
  int64_t max_unchanged_dim = -1;
  // ... (the loops that populate broadcast_sizes, broadcast_dense_sizes,
  //      broadcast_dims, unchanged_dims, nnz_factor, min_broadcast_dim and
  //      max_unchanged_dim are elided) ...

  std::vector<int64_t> new_indices_size{sparse_ndim, nnz * nnz_factor};
  std::vector<int64_t> new_values_size(values.sizes().vec());
  new_values_size[0] = new_indices_size[1];

  Tensor new_values = values.expand(broadcast_dense_sizes).repeat_interleave(nnz_factor, 0);
  Tensor new_indices = indices.new_empty(new_indices_size);
  if (!broadcast_sizes.empty()) {
    Tensor broadcast_indices = at::sparse::full_coo_indices(broadcast_sizes, indices.options()).tile(nnz);
    new_indices.narrow(0, 0, sparse_extra_ndim).copy_(broadcast_indices.narrow(0, 0, sparse_extra_ndim));
    // ... (per-dimension copies of the broadcast indices are elided) ...
  }
  // ... (copies for the unchanged sparse dimensions and the final
  //      sparse_coo_tensor construction are elided) ...
}

std::vector<Tensor> broadcast_tensors(TensorList tensors) {
  return expand_outplace(tensors);
}

static void fastCatOutDim0(const Tensor& out, const MaterializedITensorListRef& inputs) {
  auto outBytes = out.nbytes();
  char* dataPtr = reinterpret_cast<char*>(out.data_ptr());
  size_t totalBytes = 0;
  for (const Tensor& input : inputs) {
    TORCH_CHECK(outBytes >= totalBytes);
    if (input.nbytes() > 0) {
      std::memcpy(dataPtr + totalBytes, input.const_data_ptr(), input.nbytes());
    }
    totalBytes += input.nbytes();
  }
  TORCH_CHECK(outBytes == totalBytes);
}
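// Example (sketch): with contiguous, same-dtype inputs of shapes [2, 3], [1, 3]
// and [4, 3] concatenated along dim 0, fastCatOutDim0 reduces to three
// back-to-back memcpy calls into the [7, 3] output buffer.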
TORCH_IMPL_FUNC(cat_out_cpu)
(const ITensorListRef& tensors,
 int64_t dim,
 int64_t valid,
 bool all_contiguous,
 bool all_same_dtype,
 bool all_same_sizes_and_stride,
 MemoryFormat memory_format,
 const Tensor& result) {
  if (result.numel() == 0) {
    return;
  }

  auto materialized = tensors.materialize();

  bool use_serial_kernel = result.numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
  ScalarType dtype = materialized[valid].get().scalar_type();
  bool serial_dtype = at::isFloatingType(dtype);
  // fast path for single thread when both inputs and result are contiguous and
  // not empty, and concat dim is 0
  if (use_serial_kernel && all_contiguous && all_same_dtype && (MemoryFormat::Contiguous == memory_format)) {
    if (dim == 0) {
      fastCatOutDim0(result, materialized);
      return;
    }
    // TODO: Add fast cat for higher dimensions and support multi-threaded fast cat
  }

  // fast path for single thread when both inputs and result are contiguous and not empty
  if (use_serial_kernel && all_contiguous && all_same_dtype && serial_dtype) {
    cat_serial_stub(kCPU, result, materialized, dim);
    return;
  }

  int64_t offset = 0;
  if (all_same_sizes_and_stride && result.is_contiguous(memory_format) &&
      all_same_dtype) {
    const Tensor& source_slice = materialized[valid];
    auto slice_dim_size = source_slice.sizes()[dim];
    auto result_slice = result.narrow(dim, 0, slice_dim_size);
    auto result_slice_data = result_slice.data_ptr();
    auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type());

    auto iter = TensorIteratorConfig()
      .set_check_mem_overlap(false)
      .resize_outputs(false)
      .add_output(result_slice)
      .add_const_input(source_slice)
      .enforce_safe_casting_to_output(true)
      .build();

    for (const Tensor& tensor : materialized) {
      if (cat_should_skip_tensor(tensor)) {
        continue;
      }
      auto source_data = static_cast<const char*>(tensor.const_data_ptr());
      auto result_data = static_cast<char*>(result_slice_data) + offset * result_stride_bytes;
      iter.unsafe_replace_operand(0, result_data);
      iter.unsafe_replace_operand(1, const_cast<char*>(source_data));
      copy_stub(iter.device_type(), iter, false);
      offset += slice_dim_size;
    }
  } else {
    for (const Tensor& tensor : materialized) {
      if (cat_should_skip_tensor(tensor)) {
        continue;
      }
      auto slice_dim_size = tensor.sizes()[dim];
      auto result_slice = result.narrow(dim, offset, slice_dim_size);

      auto iter = TensorIteratorConfig()
        .set_check_mem_overlap(false)  // Already checked above
        .resize_outputs(false)
        .add_output(result_slice)
        .add_const_input(tensor)
        .promote_inputs_to_common_dtype(true)
        .cast_common_dtype_to_outputs(true)
        .enforce_safe_casting_to_output(true)
        .build();
      copy_stub(iter.device_type(), iter, false);
      offset += slice_dim_size;
    }
  }
}

Tensor& cat_out(TensorList tensors, Dimname dim, Tensor& result) {
  TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors");
  return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim));
}

Tensor cat(TensorList tensors, Dimname dim) {
  TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors");
  return at::cat(tensors, dimname_to_position(tensors[0], dim));
}

// torch.concat, alias for torch.cat
Tensor& concat_out(TensorList tensors, Dimname dim, Tensor& result) {
  return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim));
}

Tensor concat(TensorList tensors, Dimname dim) {
  return at::cat(tensors, dimname_to_position(tensors[0], dim));
}

Tensor& concat_out(TensorList tensors, int64_t dim, Tensor& result) {
  return at::cat_out(result, tensors, dim);
}

Tensor concat(TensorList tensors, int64_t dim) {
  return at::cat(tensors, dim);
}

// torch.concatenate, alias for torch.cat
Tensor& concatenate_out(TensorList tensors, Dimname dim, Tensor& result) {
  return at::cat_out(result, tensors, dimname_to_position(tensors[0], dim));
}

Tensor concatenate(TensorList tensors, Dimname dim) {
  return at::cat(tensors, dimname_to_position(tensors[0], dim));
}

Tensor& concatenate_out(TensorList tensors, int64_t dim, Tensor& result) {
  return at::cat_out(result, tensors, dim);
}

Tensor concatenate(TensorList tensors, int64_t dim) {
  return at::cat(tensors, dim);
}

static bool sizes_match_except(IntArrayRef s1, IntArrayRef s2, int64_t dim_except /* should already be wrapped */) {
  if (s1.size() != s2.size()) {
    return false;
  }
  for (const auto i : c10::irange(static_cast<int64_t>(s1.size()))) {
    if (i != dim_except && s1[i] != s2[i]) {
      return false;
    }
  }
  return true;
}
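// Example (sketch): sizes_match_except({2, 3, 4}, {2, 7, 4}, /*dim_except=*/1)
// is true, while sizes_match_except({2, 3, 4}, {2, 3}, 1) is false because the
// ranks differ.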
// Check to see if the shape of tensors is compatible
// for being concatenated along a given dimension.
static void check_cat_sparse_dims(Tensor const &t,
  int64_t pos /* used only for debug messages */,
  IntArrayRef sizes,
  int64_t wrapped,
  int64_t sparse_dim,
  int64_t dense_dim) {
    TORCH_CHECK(t.is_sparse(),
            "Concatenating sparse tensors, but a dense tensor was found at position ", pos, ".");
    TORCH_CHECK(sizes_match_except(sizes, t.sizes(), wrapped),
            "All tensors must have the same shape: ", sizes, " (except in the concatenating dimension),"
            " but found shape: ", t.sizes(), " at position ", pos, ".");
    TORCH_CHECK(t.sparse_dim() == sparse_dim && t.dense_dim() == dense_dim,
          "All tensors must have the same sparse_dim and dense_dim: ", sparse_dim, ", ", dense_dim,
          ", but tensor at position ", pos, " has ", t.sparse_dim(), ", ", t.dense_dim(), ".");
}

static Tensor cat_sparse_impl(const MaterializedITensorListRef& tensors, int64_t dim) {
  std::vector<Tensor> indices;
  std::vector<Tensor> values;
  int64_t wrapped = maybe_wrap_dim(dim, tensors[0].get().dim());
  int64_t sparse_dim = tensors[0].get().sparse_dim();
  int64_t dense_dim = tensors[0].get().dense_dim();
  IntArrayRef sizes = tensors[0].get().sizes();
  if (wrapped < sparse_dim) {
    for (const auto i : c10::irange(tensors.size())) {
      const Tensor& t = tensors[i];
      check_cat_sparse_dims(t, i, sizes, wrapped, sparse_dim, dense_dim);
      indices.push_back(t._indices());
      values.push_back(t._values());
    }
    Tensor idxs = at::cat(indices, 1);
    Tensor vals = at::cat(values, 0);

    // We now need to move the indices of each
    // input tensor up along `dim` by an appropriate amount.
    // E.g., if t1 has indices [[2,3,4],[5,6,7]],
    // and sizes [10, 7]
    // then torch.cat((t1,t1,t1),1) should have indices
    // [[2,3,4,2,3,4,2,3,4],[5,6,7,12,13,14,19,20,21]],
    // so we need to increase idxs[1][3:6] by 7
    // and idxs[1][6:9] by 14.
    int64_t col = 0;
    int64_t cumulative_offset = 0;
    for (const auto i : c10::irange(tensors.size())) {
      const Tensor& t = tensors[i];
      int64_t this_piece_size = t._nnz();
      // cumulative_offset is zero for the first piece, so
      // don't waste time doing this operation unless i > 0.
      if (i > 0) {
        idxs[wrapped].narrow(0, col, this_piece_size) += cumulative_offset;
      }
      cumulative_offset += t.size(wrapped);
      col += this_piece_size;
    }
    auto sizes_copy = sizes.vec();
    sizes_copy[wrapped] = cumulative_offset;
    return native::sparse_coo_tensor(
        idxs,
        vals,
        sizes_copy,
        optTypeMetaToScalarType(tensors[0].get().options().dtype_opt()),
        tensors[0].get().options().layout_opt(),
        tensors[0].get().options().device_opt(),
        tensors[0].get().options().pinned_memory_opt());
  } else {
    // Catting along a dense dimension requires us to create new values.
    // For illustration, consider the sparse 3d tensors t1 and t2,
    // given by t1 = [[[1,2],[3,4]], ... (zeros) ..., [[5,6],[7,8]]]
    // and t2 = [... (zeros) ..., [[9, 10], [11,12]], ... (zeros) ...],
    // Their concatenation along dimension 2 is:
    // [[[1,2,0,0],[3,4,0,0]], ... (zeros) ..., [[0,0,9,10],[0,0,11,12]], ... (zeros) ..., [[5,6,0,0],[7,8,0,0]]]
    //
    // Their values tensors are, respectively,
    // [[[1,2],[3,4]],[[5,6],[7,8]]] and [[[9,10],[11,12]]].
    //
    // and so the values tensor of their concatenation along dim 2 will be:
    // [[[1,2,0,0],[3,4,0,0]],[[5,6,0,0],[7,8,0,0]],[[0,0,9,10],[0,0,11,12]]]
    //
    // which we can get by taking the values tensor of each tensor, catting it with zeros of the appropriate size on the left and right,
    // and then catting all those results together.

    // The dimension in each tensor's values object that corresponds to the overall dimension along which we're catting.
    int64_t values_dim = wrapped - sparse_dim + 1;
    // The final size along the catted dimension.
    const int64_t total_size = std::accumulate(
        tensors.begin(),
        tensors.end(),
        static_cast<int64_t>(0),
        [values_dim](int64_t l, const Tensor& r) {
          return l + r._values().size(values_dim);
        });
    auto zeros_sizes = tensors[0].get()._values().sizes().vec();
    int64_t cumulative_size = 0;
    std::vector<Tensor> vals_pieces;
    std::vector<Tensor> idxs_pieces;
    for (const auto i : c10::irange(tensors.size())) {
      const Tensor& t = tensors[i];
      check_cat_sparse_dims(t, i, sizes, wrapped, sparse_dim, dense_dim);
      // dimension 0 of values corresponds to the number of values,
      // rather than to any logical dimension of the sparse tensor.
      zeros_sizes[0] = t._values().size(0);
      zeros_sizes[values_dim] = cumulative_size;
      cumulative_size += t._values().size(values_dim);
      auto z1 = at::zeros(
          zeros_sizes,
          optTypeMetaToScalarType(t._values().options().dtype_opt()),
          t._values().options().layout_opt(),
          t._values().options().device_opt(),
          t._values().options().pinned_memory_opt());
      zeros_sizes[values_dim] = total_size - cumulative_size;
      auto z2 = at::zeros(
          zeros_sizes,
          optTypeMetaToScalarType(t._values().options().dtype_opt()),
          t._values().options().layout_opt(),
          t._values().options().device_opt(),
          t._values().options().pinned_memory_opt());
      vals_pieces.push_back(at::cat({z1, t._values(), z2}, values_dim));
      idxs_pieces.push_back(t._indices());
    }
    auto sizes_copy = sizes.vec();
    sizes_copy[wrapped] = total_size;
    // This can create an uncoalesced tensor
    return native::sparse_coo_tensor(
        at::cat(idxs_pieces, 1),
        at::cat(vals_pieces),
        sizes_copy,
        optTypeMetaToScalarType(tensors[0].get().options().dtype_opt()),
        tensors[0].get().options().layout_opt(),
        tensors[0].get().options().device_opt(),
        tensors[0].get().options().pinned_memory_opt());
  }
}

Tensor cat_sparse(const ITensorListRef& tensors, int64_t dim) {
  auto materialized = tensors.materialize();
  auto maybe_outnames = namedinference::compute_cat_outnames(materialized);
  auto result = cat_sparse_impl(materialized, at::legacy_cat_wrap_dim(dim, materialized));
  namedinference::propagate_names_if_nonempty(result, maybe_outnames);
  return result;
}
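// Illustrative usage (sketch) of sparse concatenation:
//   auto s = at::ones({2, 3}).to_sparse();
//   auto c = at::cat({s, s}, 0);  // sparse COO tensor of shape [4, 3]; the
//                                 // second block's dim-0 indices are offset by 2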
Input ", tensor_idx, " has ", ndims, " dimensions" ); int64_t dim0 = 1; int64_t dim1 = 1; if (ndims == 2) { dim0 = tensor.size(0); dim1 = tensor.size(1); tensors_2D[tensor_idx] = tensor; } else if (ndims == 1) { // Switching dim 0 to dim 1 is intentional dim1 = tensor.size(0); tensors_2D[tensor_idx] = tensor.expand({dim0, dim1}); } else { tensors_2D[tensor_idx] = tensor.expand({dim0, dim1}); } result_dim0 += dim0; result_dim1 += dim1; } result = at::zeros( {result_dim0, result_dim1}, tensors[0].options().dtype(output_scalar_type) ); int64_t cur_dim0 = 0; int64_t cur_dim1 = 0; // Copy each tensor into the appropriate location in the result matrix for (const auto& tensor : tensors_2D) { int64_t dim0 = tensor.size(0); int64_t dim1 = tensor.size(1); result.slice(0, cur_dim0, cur_dim0+dim0).slice(1, cur_dim1, cur_dim1+dim1).copy_(tensor); cur_dim0 += dim0; cur_dim1 += dim1; } return result; } std::vector chunk(const Tensor& self, int64_t chunks, int64_t dim) { TORCH_CHECK(self.dim() > 0, "chunk expects at least a 1-dimensional tensor"); TORCH_CHECK(chunks > 0, "chunk expects `chunks` to be greater than 0, got: ", chunks); const auto dim_size = self.sym_size(dim); auto split_size = (dim_size + chunks - 1) / chunks; // We need to call split_with_sizes in the case where split_size and dimension size are 0, because // a call to split would discard the number of chunks (because we can have an arbitrary number of // 0-sized chunks adding up to 0). So, call split_with_sizes with the correct number of chunks, // eventually we will do this for all cases. if (split_size == 0 && dim_size == 0) { std::vector split_sizes(chunks, split_size); split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size); return self.split_with_sizes_symint(split_sizes, dim); } else { return self.split_symint(std::move(split_size), dim); } } std::vector tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) { TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims"); int64_t dim_ = maybe_wrap_dim(dim, self.dim()); // NB: intentional, sections specifies number of output tensors, which // cannot be polymorphic int64_t sections = sym_sections.guard_int(__FILE__, __LINE__); TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections); const auto dim_size = self.sym_size(dim_); std::vector splits(sections); auto min_split_size = dim_size / sections; auto num_splits_one_extra = dim_size % sections; c10::SymInt start_idx = 0; for (const auto split_idx : c10::irange(sections)) { auto split_size = (num_splits_one_extra > split_idx) ? 
std::vector<Tensor> tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) {
  TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
  int64_t dim_ = maybe_wrap_dim(dim, self.dim());
  // NB: intentional, sections specifies number of output tensors, which
  // cannot be polymorphic
  int64_t sections = sym_sections.guard_int(__FILE__, __LINE__);
  TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections);
  const auto dim_size = self.sym_size(dim_);
  std::vector<Tensor> splits(sections);
  auto min_split_size = dim_size / sections;
  auto num_splits_one_extra = dim_size % sections;
  c10::SymInt start_idx = 0;
  for (const auto split_idx : c10::irange(sections)) {
    auto split_size = (num_splits_one_extra > split_idx) ? (min_split_size + 1) : min_split_size;
    splits[split_idx] = at::slice_symint(self, dim_, start_idx, start_idx + split_size);
    start_idx += split_size;
  }
  return splits;
}

template <typename T>
std::vector<Tensor> _tensor_split_indices(const Tensor& self, ArrayRef<T> indices, int64_t dim) {
  TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
  int64_t dim_ = maybe_wrap_dim(dim, self.dim());
  int64_t num_indices = indices.size();
  std::vector<Tensor> splits(num_indices + 1);
  T start_idx(0);
  for (const auto split_idx : c10::irange(num_indices)) {
    auto end_idx = indices[split_idx];
    splits[split_idx] = at::symint::slice<T>(self, dim_, start_idx, end_idx);
    start_idx = end_idx;
  }
  splits[num_indices] = at::symint::slice<T>(self, dim_, start_idx, at::symint::size<T>(self, dim_));
  return splits;
}

std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
  return _tensor_split_indices(self, indices, dim);
}

std::vector<Tensor> tensor_split_indices_symint(const Tensor& self, SymIntArrayRef indices, int64_t dim) {
  return _tensor_split_indices(self, indices, dim);
}

std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indices_or_sections, int64_t dim) {
  TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
  auto split_device = tensor_indices_or_sections.device();
  TORCH_CHECK(split_device == kCPU,
    "tensor_split expected tensor_indices_or_sections to be on cpu, but it's on ", split_device);
  auto split_dtype = tensor_indices_or_sections.scalar_type();
  TORCH_CHECK(split_dtype == at::kLong,
    "tensor_split expected tensor_indices_or_sections to have dtype of long, but got ", split_dtype);
  auto split_dim = tensor_indices_or_sections.dim();
  TORCH_CHECK(split_dim == 1 || split_dim == 0,
    "tensor_split expected tensor_indices_or_sections to be a zero-dimensional or one-dimensional tensor, but got a tensor with ", split_dim, " dims");

  if (split_dim == 0) {
    int64_t sections = tensor_indices_or_sections.item<int64_t>();
    return self.tensor_split(sections, dim);
  } else {
    auto indices_data = tensor_indices_or_sections.const_data_ptr<int64_t>();
    auto stride = tensor_indices_or_sections.stride(0);
    auto numel = tensor_indices_or_sections.numel();
    std::vector<int64_t> indices(numel);
    for (const auto offset : c10::irange(numel)) {
      // indices tensor could be non-contiguous
      indices[offset] = *(indices_data + offset * stride);
    }
    return self.tensor_split(indices, dim);
  }
}
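// Worked example (sketch) for tensor_split: with dim size 8 and sections = 3,
// min_split_size = 2 and two splits get one extra element, giving sizes
// 3, 3, 2; with indices {2, 5} the pieces are the slices [0, 2), [2, 5), [5, 8).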
std::vector<Tensor> unsafe_chunk(const Tensor& self, int64_t chunks, int64_t dim) {
  TORCH_CHECK(self.dim() > 0,
           "chunk expects at least a 1-dimensional tensor");
  TORCH_CHECK(chunks > 0,
           "chunk expects `chunks` to be greater than 0, got: ", chunks);

  const auto dim_size = self.size(dim);
  int64_t split_size = (dim_size + chunks - 1) / chunks;

  // See the comment above in chunk(...)
  if (split_size == 0 && dim_size == 0) {
    std::vector<int64_t> split_sizes(chunks, split_size);
    split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size);
    return self.unsafe_split_with_sizes(split_sizes, dim);
  } else {
    return self.unsafe_split(split_size, dim);
  }
}

Tensor diagflat(const Tensor& self, int64_t offset) {
  return self.contiguous().view(-1).diag(offset);
}

Tensor diagonal(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_) {
  int64_t nDims = self.dim();
  int64_t dim1 = maybe_wrap_dim(dim1_, nDims);
  int64_t dim2 = maybe_wrap_dim(dim2_, nDims);
  TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_);
  auto outnames = namedinference::compute_diagonal_outnames(self, dim1, dim2);
  NoNamesGuard no_names_guard;

  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t diag_size;
  int64_t storage_offset = self.storage_offset();
  // compute storage offset and size for the diagonal
  // for positive values of offset (above the main diagonal)
  // "leftmost columns" (along dim2) are dropped
  // for negative values of offset (below the main diagonal)
  // "topmost rows" (along dim1) are dropped.
  // Note that we invert +/- in the second to absorb the negative
  // sign in the offset.
  if (offset >= 0) {
    diag_size = std::max<int64_t>(std::min(self.size(dim1), self.size(dim2)-offset), 0);
  } else {
    diag_size = std::max<int64_t>(std::min(self.size(dim1)+offset, self.size(dim2)), 0);
  }

  // NumPy allows you to specify offsets "off the end"; let's just be careful not to
  // set a ridiculous storage_offset in that case (technically it shouldn't matter
  // because there are no elements in the tensor, but let's be kosher).
  if (diag_size == 0) {
    // skip
  } else if (offset >= 0) {
    storage_offset += offset * self.stride(dim2);
  } else {
    storage_offset -= offset * self.stride(dim1);
  }

  // construct new size and stride: we drop dim1 and dim2 (maximum first for not changing the index of the minimum)
  // the new ("joint") dimension is appended to the end of the shape / stride to match numpy semantics
  DimVector sizes(self.sizes().begin(), self.sizes().end());
  DimVector strides(self.strides().begin(), self.strides().end());
  sizes.erase(sizes.begin() + std::max(dim1, dim2));
  strides.erase(strides.begin() + std::max(dim1, dim2));
  sizes.erase(sizes.begin() + std::min(dim1, dim2));
  strides.erase(strides.begin() + std::min(dim1, dim2));
  sizes.push_back(diag_size);
  strides.push_back(self.stride(dim1)+self.stride(dim2));

  // return view with new parameters
  auto result = self.as_strided(sizes, strides, storage_offset);

  no_names_guard.reset();
  namedinference::propagate_names_if_nonempty(result, outnames);
  return result;
}
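// Worked example (sketch) for the diag_size computation above: for a 3x4 input,
// offset = 1 gives min(3, 4 - 1) = 3 diagonal elements, while offset = -1 gives
// min(3 - 1, 4) = 2.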
Tensor diagonal(const Tensor& self, Dimname outdim, Dimname dim1, Dimname dim2, int64_t offset) {
  auto result = at::diagonal(
      self,
      offset,
      dimname_to_position(self, dim1),
      dimname_to_position(self, dim2));
  // This is slower than it needs to be because there is no way to modify
  // the names of a tensor in-place right now. In the future we should consider
  // offering that functionality.
  std::vector<Dimname> new_names = result.names().vec();
  new_names[new_names.size() - 1] = outdim;
  return result.refine_names(new_names);
}

Tensor diag_embed(const Tensor& self, int64_t offset, int64_t dim1_, int64_t dim2_) {
  int64_t nDims = self.dim() + 1;
  int64_t dim1 = maybe_wrap_dim(dim1_, nDims);
  int64_t dim2 = maybe_wrap_dim(dim2_, nDims);
  TORCH_CHECK(dim1 != dim2, "diagonal dimensions cannot be identical ", dim1_, ", ", dim2_);
  int64_t new_dim_len = std::abs(offset) + self.size(-1);
  auto sizes = self.sizes().vec();
  sizes.pop_back();
  sizes.insert(sizes.begin() + std::min(dim1, dim2), new_dim_len);
  sizes.insert(sizes.begin() + std::max(dim1, dim2), new_dim_len);
  auto result = at::zeros(sizes, self.options());
  auto diag = result.diagonal(offset, dim1, dim2);
  diag.copy_(self);
  return result;
}
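// Worked example (sketch) for diag_embed: an input of shape (2, 3) with
// offset = 0 and the default dims produces a result of shape (2, 3, 3) whose
// last two dimensions hold diagonal matrices built from the input's rows.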
instead"); auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); setStrided(result, size, stride, storage_offset); return result; } template inline void setStridedUnchecked( const Tensor& self, ArrayRef size, ArrayRef stride, T&& storage_offset) { auto* self_ = self.unsafeGetTensorImpl(); self_->set_sizes_and_strides(size, stride, std::make_optional(std::forward(storage_offset))); } Tensor as_strided_tensorimpl_meta_symint(const Tensor& self, SymIntArrayRef sym_size, SymIntArrayRef sym_stride, std::optional sym_storage_offset_) { auto sym_storage_offset = sym_storage_offset_.value_or(self.sym_storage_offset()); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype()); // NB: The reason this is unchecked is to ensure we don't generate // guards on the base storage itself when performing as_strided calls. // Although technically these guards are necessary, in practice they // cause a lot of guards that falsely refer to base symbols. We will instead // rely on AOTAutograd to sort out if we actually have dependence on view // bases / storage size. setStridedUnchecked(result, sym_size, sym_stride, std::move(sym_storage_offset)); return result; } Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, std::optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); auto quantizer = get_qtensorimpl(self)->quantizer(); TORCH_CHECK( quantizer->qscheme() == QScheme::PER_TENSOR_AFFINE, "Setting strides is possible only on uniformly quantized tensor"); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype(), quantizer); setStrided(result, size, stride, storage_offset); return result; } // This is an overloaded function similar to // Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, std::optional storage_offset_) // and is currently not available through the dispatcher. The additional // input, quantizer, is called by the select & slice methods. // TODO: Make this function compatible with the dispatcher static Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef stride, std::optional storage_offset_, QuantizerPtr quantizer) { auto storage_offset = storage_offset_.value_or(self.storage_offset()); TORCH_CHECK( (quantizer->qscheme() == QScheme::PER_TENSOR_AFFINE) || (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE), "Setting strides is possible only on uniformly or per channel quantized tensors"); auto result = at::detail::make_tensor( c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype(), quantizer); setStrided(result, size, stride, storage_offset); return result; } const Tensor &as_strided__symint(const Tensor& self, SymIntArrayRef size, SymIntArrayRef stride, std::optional storage_offset_) { auto storage_offset = storage_offset_.value_or(self.sym_storage_offset()); setStrided(self, size, stride, std::move(storage_offset)); return self; } // Should just use narrow_copy_out, but this API is used internally at Meta: // https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561 Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){ // narrow_copy_dense_cpu_out always resize output's size, so there only create // a zero size tensor. 
// Should just use narrow_copy_out, but this API is used internally at Meta:
// https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561
Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){
  // narrow_copy_dense_cpu_out always resize output's size, so there only create
  // a zero size tensor.
  auto output = at::empty({0}, self.options());
  return narrow_copy_dense_cpu_out(self, dim, start, length, output);
}

Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
  int64_t allDim = self.dim();
  int64_t end = start+length;
  TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor.");
  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
  TORCH_CHECK(dim >= 0 && dim < allDim,
    "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, ".");
  TORCH_CHECK(start >= 0 && end <= self.size(dim),
    "Invalid range to narrow. range(start, start+length) must be a subset of range(0, ", self.size(dim), ").")
  Tensor indices = self._indices();
  int64_t sparse_dim = self.sparse_dim();

  std::vector<int64_t> new_sizes = self.sizes().vec();
  new_sizes[dim] = length;

  Tensor new_values;
  Tensor new_indices;
  if (dim < sparse_dim) {
    Tensor mask = (indices[dim] >= start).__and__((indices[dim] < end));
    new_indices = indices.masked_select(mask).view({sparse_dim, -1});
    new_indices[dim].sub_(start);
    Tensor nzIndices = mask.nonzero().view(-1);
    new_values = self._values().index_select(0, nzIndices);
  } else {
    /* This means we are narrowing on a dense dim, which is in effect just a
        regular narrow on _values() */
    new_indices = indices;
    int64_t dense_dim = dim - sparse_dim + 1;
    new_values = self._values().narrow_copy(dense_dim, start, length);
  }

  return at::sparse_coo_tensor(new_indices, new_values, new_sizes, self.options(), self.is_coalesced());
}
// Should just use narrow_copy_out, but this API is used internally at Meta:
// https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561
Tensor& narrow_copy_dense_cpu_out(
  const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output
) {
  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
  TORCH_CHECK(self.dtype() == output.dtype());

  auto self_contig = self.expect_contiguous();
  const auto self_sizes = self_contig->sizes();

  // wrap dim if negative and do bound check
  if (dim < 0) {
    dim = at::maybe_wrap_dim(dim, self_sizes.size());
  } else {
    TORCH_CHECK(dim < static_cast<int64_t>(self_sizes.size()));
  }

  // wrap start and do bound check
  const auto cur_size = self_sizes[dim];
  TORCH_CHECK_INDEX(
      -cur_size <= start && start <= cur_size,
      "start out of range (expected to be in range of [", -cur_size, ", ", cur_size, "], but got ", start, ")"
  )
  if (start < 0) {
    start = start + cur_size;
  }
  TORCH_CHECK(
      length >= 0 && start <= cur_size - length,
      "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");

  // resize output
  auto output_sizes = self_sizes.vec();
  output_sizes[dim] = length;
  at::native::resize_(output, output_sizes);

  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t unit = c10::size_from_dim_(dim + 1, self_sizes);
  const int64_t num_blocks = c10::size_to_dim_(dim, self_sizes);

  const auto itemsize = self_contig->dtype().itemsize();
  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
  size_t src_nbytes = itemsize * self_contig->numel();
  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
  size_t dst_nbytes = itemsize * output.numel();

  size_t src_block_size = unit * self_sizes[dim];
  size_t dst_block_size = unit * length;

  if (num_blocks == 0 || dst_block_size == 0) {
    return output;
  }

  const char* src_bytes = static_cast<const char*>(self_contig->const_data_ptr());
  char* dst_bytes = static_cast<char*>(output.data_ptr());

  size_t src_block_size_bytes = itemsize * src_block_size;
  size_t dst_block_size_bytes = itemsize * dst_block_size;
  size_t src_offset = unit * start;

  const char* src_offset_bytes = src_bytes + itemsize * src_offset;
  char* dst_offset_bytes = dst_bytes;

  for (const auto i : c10::irange(num_blocks)) {
    const char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes;
    char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes;
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        static_cast<const void*>(local_src_offset_bytes + dst_block_size_bytes) <=
        static_cast<const void*>(src_bytes + src_nbytes));
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        static_cast<const void*>(local_dst_offset_bytes + dst_block_size_bytes) <=
        static_cast<const void*>(dst_bytes + dst_nbytes));

    memcpy(
        local_dst_offset_bytes,
        local_src_offset_bytes,
        dst_block_size_bytes);
  }
  return output;
}

Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
  auto cur_size = self.size(dim);
  TORCH_CHECK_INDEX(
      -cur_size <= start && start <= cur_size,
      "start out of range (expected to be in range of [", -cur_size, ", ", cur_size, "], but got ", start, ")"
  )
  if (start < 0) {
    start = start + cur_size;
  }
  TORCH_CHECK(start <= cur_size - length,
           "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
  return at::slice(self, dim, start, start + length, 1);
}

Tensor narrow_symint(const Tensor& self, int64_t dim, SymInt start, SymInt length) {
  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
  TORCH_SYM_CHECK(length.sym_ge(0), "narrow(): length must be non-negative.");
  auto cur_size = self.sym_size(dim);
  TORCH_CHECK_INDEX(
      ((-cur_size).sym_le(start).sym_and(start.sym_le(cur_size))).expect_true(__FILE__, __LINE__),
      "start out of range (expected to be in range of [", -cur_size, ", ", cur_size, "], but got ", start, ")"
  )
  if (start < 0) {
    start = start + cur_size;
  }
  TORCH_SYM_CHECK(start.sym_le(cur_size - length),
           "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
  return at::slice_symint(self, dim, start, start + length, 1);
}

// This overload exists purely for XLA, because they wanted to pass in "symbolic"
// start via Tensor.
Tensor narrow_tensor_symint(const Tensor& self, int64_t dim, const Tensor& start, SymInt length) {
  TORCH_CHECK(start.dim() == 0 && isIntegralType(start.scalar_type(), /*includeBool=*/false),
              "start must be an 0-dim integral Tensor.");
  int64_t st = start.item<int64_t>();
  return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
}
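// Worked example (sketch) for narrow: for a tensor with 5 elements along dim 0,
// narrow(self, 0, /*start=*/-2, /*length=*/2) wraps the start to 3 and returns
// the slice covering indices [3, 5).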
std::tuple<DimVector, DimVector, std::vector<int64_t>>
static _permute_size_stride_estimation(const Tensor& self, IntArrayRef dims) {
  const auto ndim = self.dim();
  TORCH_CHECK(ndim == static_cast<int64_t>(dims.size()),
      "permute(sparse_coo): number of dimensions in the tensor input ",
      "does not match the length of the desired ordering of dimensions ",
      "i.e. input.dim() = ", ndim, " is not equal to len(dims) = ", dims.size());

  const auto is_strided_layout = self.options().layout() == at::kStrided;
  const auto old_sizes = self.sizes();
  const auto old_strides = is_strided_layout ? self.strides() : IntArrayRef{};

  auto new_sizes = DimVector(ndim);
  auto new_strides = DimVector(is_strided_layout ? ndim : 0);
  auto wrapped_dims = std::vector<int64_t>(ndim);
  std::vector<bool> seen_dims(ndim);

  for (const auto i : c10::irange(ndim)) {
    const auto d = maybe_wrap_dim(dims[i], ndim);
    TORCH_CHECK(!seen_dims[d],
        "permute(): duplicate dims are not allowed.");
    seen_dims[d] = true;
    wrapped_dims[i] = d;
    new_sizes[i] = old_sizes[d];
    if (is_strided_layout) {
      new_strides[i] = old_strides[d];
    }
  }

  return std::make_tuple(new_sizes, new_strides, wrapped_dims);
}

Tensor permute(const Tensor& self, IntArrayRef dims) {
  auto [new_sizes, new_strides, _] = _permute_size_stride_estimation(self, dims);
  return self.as_strided(new_sizes, new_strides);
}

Tensor permute_sparse_coo(const Tensor& self, IntArrayRef dims) {
  auto [new_sizes, _, wrapped_dims] = _permute_size_stride_estimation(self, dims);

  const auto ndim = self.dim();
  const auto sparse_ndim = self.sparse_dim();
  const auto dense_ndim = self.dense_dim();

  auto dims_id_perm = std::vector<int64_t>(ndim);
  auto dims_sparse_dense_id_perm = std::vector<int64_t>(ndim);
  for (const auto i : c10::irange(ndim)) {
    dims_id_perm[i] = i;
    dims_sparse_dense_id_perm[i] = wrapped_dims[i];
  }
  std::sort(dims_sparse_dense_id_perm.begin(), dims_sparse_dense_id_perm.begin() + sparse_ndim);
  std::sort(dims_sparse_dense_id_perm.begin() + sparse_ndim, dims_sparse_dense_id_perm.end());
  TORCH_CHECK(dims_sparse_dense_id_perm == dims_id_perm,
      "permute(sparse_coo): transpositions between sparse and dense dimensions are not allowed.",
      "Only transpositions within sparse and dense dimensions are supported.");

  const auto slice = [](std::vector<int64_t> v, size_t begin, size_t len) -> decltype(v) {
    return std::vector<int64_t>{v.begin() + begin, v.begin() + begin + len};
  };

  auto old_sparse_dims = slice(dims_id_perm, 0, sparse_ndim);
  auto old_dense_dims = slice(std::move(dims_id_perm), sparse_ndim, ndim - sparse_ndim);
  auto new_sparse_dims = slice(wrapped_dims, 0, sparse_ndim);
  auto new_dense_dims = slice(std::move(wrapped_dims), sparse_ndim, ndim - sparse_ndim);

  auto old_indices = self._indices();
  auto old_values = self._values();

  const auto new_indices = (new_sparse_dims == old_sparse_dims)
    ? std::move(old_indices)
    : [&]() -> Tensor {
        auto sparse_perm_tensor = at::from_blob(reinterpret_cast<void*>(new_sparse_dims.data()),
            {sparse_ndim}, old_indices.options().device(at::kCPU));
        // creates new indices. It is possible to avoid that if COO
        // is allowed to store a permutation vector.
        return old_indices.index_select(0, sparse_perm_tensor.to(self.device().type()));
      }();
  const auto new_values = (new_dense_dims == old_dense_dims)
    ? std::move(old_values)
    : [&]() -> Tensor {
        auto values_perm = std::vector<int64_t>(dense_ndim + 1);
        for (const auto i : c10::irange(dense_ndim)) {
          values_perm[i + 1] = new_dense_dims[i] - sparse_ndim + 1;
        }
        return old_values.permute(values_perm);
      }();

  const auto is_coalesced = self.is_coalesced() &&
    (dims.empty() || dims[0] == 0);
  // TODO: apply `is_coalesced ||= new_values.size(0) < 2`.
  return _sparse_coo_tensor_with_dims_and_tensors(
      sparse_ndim, dense_ndim, new_sizes, new_indices, new_values, self.options(), is_coalesced);
}
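// Worked example (sketch) for permute: a contiguous tensor of shape (2, 3) has
// strides (3, 1); permute({1, 0}) reorders both, yielding a view with sizes
// (3, 2) and strides (1, 3) over the same storage.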
Tensor repeat(const Tensor& self, IntArrayRef repeats) {
  TORCH_CHECK(repeats.size() >= (size_t)self.dim(),
           "Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor");

  // Add new leading dimensions to the tensor if the
  // number of target dimensions is larger than the
  // number of source dimensions.
  int64_t num_new_dimensions = repeats.size() - self.dim();
  DimVector padded_size(num_new_dimensions, 1);
  padded_size.insert(padded_size.end(), self.sizes().begin(), self.sizes().end());
  DimVector target_size(repeats.size());
  bool zero_tensor = false;
  for (const auto idx : c10::irange(repeats.size())) {
    if (repeats[idx] == 0) {
      zero_tensor = true;
    }
    target_size[idx] = padded_size[idx] * repeats[idx];
  }

  Tensor xtensor = self.expand(padded_size);

  Tensor result;
  if (self.is_quantized()) {
    result = at::empty_quantized(target_size, self);
  } else {
    result = at::empty(target_size, self.options());
  }

  // return an empty tensor if one of the repeat dimensions is zero
  if (zero_tensor) {
    return result;
  }

  Tensor urtensor = at::alias(result);
  for (const auto i : c10::irange(xtensor.dim())) {
    // can't unfold with step 0, so make sure step is at least 1
    // (it doesn't matter what it is in that case, because the size is 0).
    auto size_i = xtensor.sizes()[i];
    urtensor = urtensor.unfold(i, size_i, std::max<int64_t>(size_i, 1));
  }

  urtensor.copy_(xtensor.expand_as(urtensor));

  return result;
}

Tensor tile_symint(const Tensor& self, SymIntArrayRef reps){
  // If self.size() > len(reps), reps is promoted to self.size() by pre-pending
  // 1’s to it to keep the same behaviour as `numpy.tile`.
  // Thus for a tensor of shape (2, 3, 4, 5), a dims of (2, 2) is treated
  // as (1, 1, 2, 2).
  const int64_t size_diff = self.dim() - static_cast<int64_t>(reps.size());
  if (size_diff > 0){
    std::vector<c10::SymInt> new_reps(size_diff, 1);
    for (const auto i : c10::irange(reps.size())) {
      new_reps.emplace_back(reps[i]);
    }
    return self.repeat_symint(SymIntArrayRef(new_reps));
  }
  // `torch.tile` is equivalent to the already implemented `torch.Tensor.repeat`
  return self.repeat_symint(reps);
}

//
// templated for ArrayRef<int64_t> and SmallVector<int64_t> use cases
//
template <typename Vec>
Tensor alias_with_sizes_and_strides(
    const Tensor& self,
    const Vec& sizes,
    const Vec& strides) {
  //caller should make sure that sizes and strides are valid for self
  //(storage is sufficient, strides are non-negative, strides and sizes array size is the same)
  Tensor self_;
  if (self.is_quantized()) {
    self_ = at::detail::make_tensor<QTensorImpl>(
      c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype(), get_qtensorimpl(self)->quantizer());
    auto* self_tmp_ = self_.unsafeGetTensorImpl();
    self_tmp_->set_storage_offset(self.storage_offset());
    self_tmp_->set_sizes_and_strides(sizes, strides);
  } else {
    self_ = at::detail::make_tensor<TensorImpl>(
      c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype());
    auto* self_tmp_ = self_.unsafeGetTensorImpl();
    self_tmp_->set_storage_offset(self.storage_offset());
    self_tmp_->set_sizes_and_strides(sizes, strides);
  }
  namedinference::propagate_names(self_, self);
  return self_;
}

// specialization for symbolic shapes and strides.
// SymIntArrayRef/ArrayRef<c10::SymInt> and SmallVector<c10::SymInt>/SymDimVector
template