1 /*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
10
11 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
12 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
13 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
14 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
15
16 namespace vkcompute {
17
18 using utils::ivec3;
19 using utils::uvec3;
20
add_copy_offset_node(ComputeGraph & graph,const ValueRef in,const ivec3 & range,const ivec3 & src_offset,const ivec3 & dst_offset,const ValueRef out)21 void add_copy_offset_node(
22 ComputeGraph& graph,
23 const ValueRef in,
24 const ivec3& range,
25 const ivec3& src_offset,
26 const ivec3& dst_offset,
27 const ValueRef out) {
28 vTensorPtr t_in = graph.get_tensor(in);
29 vTensorPtr t_out = graph.get_tensor(out);
30
31 std::string kernel_name = "copy_offset";
32 kernel_name.reserve(kShaderNameReserve);
33 add_dtype_suffix(kernel_name, *t_out);
34 add_storage_type_suffix(kernel_name, *t_out);
35
36 const struct Block final {
37 alignas(16) ivec3 range;
38 alignas(16) ivec3 src_offset;
39 alignas(16) ivec3 dst_offset;
40 } offset_params{
41 range,
42 src_offset,
43 dst_offset,
44 };
45
46 auto shader = VK_KERNEL_FROM_STR(kernel_name);
47
48 graph.execute_nodes().emplace_back(new DispatchNode(
49 graph,
50 VK_KERNEL_FROM_STR(kernel_name),
51 graph.create_global_wg_size(out),
52 graph.create_local_wg_size(out),
53 // Inputs and Outputs
54 {
55 {out, vkapi::kWrite},
56 {in, vkapi::kRead},
57 },
58 // Parameter buffers
59 {
60 graph.create_params_buffer(offset_params),
61 },
62 // Specialization Constants
63 {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}));
64 }
65
add_copy_channel_offset_node(ComputeGraph & graph,const ValueRef in,int32_t channel_range,int32_t src_channel_offset,int32_t dst_channel_offset,const ValueRef out)66 void add_copy_channel_offset_node(
67 ComputeGraph& graph,
68 const ValueRef in,
69 int32_t channel_range,
70 int32_t src_channel_offset,
71 int32_t dst_channel_offset,
72 const ValueRef out) {
73 vTensorPtr t_in = graph.get_tensor(in);
74 vTensorPtr t_out = graph.get_tensor(out);
75
76 // Likely need to prepad these numbers.
77 std::vector<int64_t> in_sizes = t_in->sizes();
78 std::vector<int64_t> out_sizes = t_out->sizes();
79
80 VK_CHECK_COND(check_packed_dim_is(*t_in, WHCN::kChannelsDim));
81 VK_CHECK_COND(check_packed_dim_is(*t_out, WHCN::kChannelsDim));
82
83 // NOTE: This function should be able to support 1d and 2d tensors when
84 // range=1, src_offset=dst_offset=1.
85 VK_CHECK_COND(t_in->dim() >= 3, "Src dim should be at least 3");
86 VK_CHECK_COND(t_out->dim() >= 3, "Dst dim should be at least 3");
87
88 VK_CHECK_COND(
89 dim_at<kChannel4D>(in_sizes) >= src_channel_offset + channel_range,
90 "Src channel (",
91 src_channel_offset,
92 ") and range (",
93 channel_range,
94 ") should be less than or equal to input tensor's channel size (",
95 dim_at<kChannel4D>(in_sizes),
96 ")");
97
98 VK_CHECK_COND(
99 dim_at<kChannel4D>(out_sizes) >= dst_channel_offset + channel_range,
100 "Dst channel (",
101 dst_channel_offset,
102 ") and range (",
103 channel_range,
104 ") should be less than or equal to input tensor's channel size (",
105 dim_at<kChannel4D>(out_sizes),
106 ")");
107
108 VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative");
109 VK_CHECK_COND(
110 src_channel_offset >= 0, "Src channel offset must be non-negative");
111 VK_CHECK_COND(
112 dst_channel_offset >= 0, "Dst channel offset must be non-negative");
113
114 std::string kernel_name = "copy_channel_offset";
115 kernel_name.reserve(kShaderNameReserve);
116 add_dtype_suffix(kernel_name, *t_out);
117
118 int32_t out_channels = dim_at<kChannel4D>(out_sizes);
119
120 // Copy one batch at a time.
121 for (int batch_idx = 0; batch_idx < dim_at<kBatch4D>(in_sizes); batch_idx++) {
122 // Mapping the tensor NCHW coordinates into texture XYZ coordinates
123 int32_t dst_first_z = dst_channel_offset / 4;
124 int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4;
125
126 // We copy the entire width and height dimension. For the channel dimension,
127 // we use the z-dimension of the global_size to specify the texture range.
128 // The shader combines the global invocation id and the dst_offset to get
129 // the actual coordinate.
130
131 ivec3 dst_offset{
132 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)};
133
134 uvec3 global_size{
135 utils::safe_downcast<uint32_t>(dim_at<kWidth4D>(in_sizes)),
136 utils::safe_downcast<uint32_t>(dim_at<kHeight4D>(in_sizes)),
137 utils::safe_downcast<uint32_t>(dst_last_z - dst_first_z + 1)};
138 uvec3 local_size = adaptive_work_group_size(global_size);
139
140 const struct Block final {
141 ivec3 range;
142 int32_t channel_range;
143 ivec3 dst_offset;
144 int32_t dst_channel_offset;
145 int32_t src_channel_offset;
146 } channel_offset_params{
147 utils::make_ivec3(global_size),
148 channel_range,
149 dst_offset,
150 dst_channel_offset,
151 src_channel_offset,
152 };
153
154 auto shader = VK_KERNEL_FROM_STR(kernel_name);
155
156 graph.execute_nodes().emplace_back(new DispatchNode(
157 graph,
158 VK_KERNEL_FROM_STR(kernel_name),
159 global_size,
160 local_size,
161 // Inputs and Outputs
162 {
163 {out, vkapi::MemoryAccessType::WRITE},
164 {out, vkapi::MemoryAccessType::READ},
165 {in, vkapi::MemoryAccessType::READ},
166 },
167 // Parameter buffers
168 {
169 t_out->sizes_ubo(),
170 t_in->sizes_ubo(),
171 graph.create_params_buffer(channel_offset_params),
172 },
173 // Specialization Constants
174 {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}));
175 }
176 }
177
add_copy_offset_node(ComputeGraph & graph,ValueRef in,ValueRef range_ref,ValueRef src_offset_ref,ValueRef dst_offset_ref,ValueRef out)178 void add_copy_offset_node(
179 ComputeGraph& graph,
180 ValueRef in,
181 ValueRef range_ref,
182 ValueRef src_offset_ref,
183 ValueRef dst_offset_ref,
184 ValueRef out) {
185 ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref));
186 ivec3 src_offset = utils::make_ivec3(*graph.get_int_list(src_offset_ref));
187 ivec3 dst_offset = utils::make_ivec3(*graph.get_int_list(dst_offset_ref));
188
189 add_copy_offset_node(graph, in, range, src_offset, dst_offset, out);
190 }
191
copy_offset(ComputeGraph & graph,const std::vector<ValueRef> & args)192 void copy_offset(ComputeGraph& graph, const std::vector<ValueRef>& args) {
193 add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]);
194 }
195
copy_channel_offset(ComputeGraph & graph,const std::vector<ValueRef> & args)196 void copy_channel_offset(
197 ComputeGraph& graph,
198 const std::vector<ValueRef>& args) {
199 ValueRef in = args[0];
200 ValueRef channel_range_ref = args[1];
201 ValueRef src_channel_offset_ref = args[2];
202 ValueRef dst_channel_offset_ref = args[3];
203 ValueRef out = args[4];
204
205 auto channel_range = graph.extract_scalar<int64_t>(channel_range_ref);
206 auto src_channel_offset =
207 graph.extract_scalar<int64_t>(src_channel_offset_ref);
208 auto dst_channel_offset =
209 graph.extract_scalar<int64_t>(dst_channel_offset_ref);
210
211 add_copy_channel_offset_node(
212 graph, in, channel_range, src_channel_offset, dst_channel_offset, out);
213 }
214
// Operator registration for this file: associates the operator names
// `etvk.copy_offset` and `etvk.copy_channel_offset` with the entry-point
// functions of the same name defined above.
// NOTE(review): the exact registration mechanism is defined by the
// REGISTER_OPERATORS / VK_REGISTER_OP macros, which are not visible here.
215 REGISTER_OPERATORS {
216 VK_REGISTER_OP(etvk.copy_offset, copy_offset);
217 VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset);
218 }
219
220 } // namespace vkcompute
221