#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/layer_norm.h>

#include <ATen/core/Tensor.h>
#include <ATen/Parallel.h>
#include <ATen/native/cpu/mixed_data_type.h>
#include <c10/util/irange.h>
#include <ATen/OpMathType.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/empty_like_native.h>
#include <ATen/ops/layer_norm_native.h>
#include <ATen/ops/native_batch_norm.h>
#include <ATen/ops/native_layer_norm.h>
#include <ATen/ops/native_layer_norm_backward_native.h>
#include <ATen/ops/native_layer_norm_native.h>
#include <ATen/ops/pow.h>
#include <ATen/ops/rsqrt.h>
#include <ATen/ops/rms_norm_native.h>
#include <ATen/ops/zeros_like_native.h>
#endif

#include <array>
#include <tuple>
#include <vector>

namespace at::native {

static void layer_norm_with_mean_rstd_out(
    at::Tensor& out,
    at::Tensor& mean,
    at::Tensor& rstd,
    const at::Tensor& input,
    IntArrayRef normalized_shape,
    const Tensor& gamma,
    const Tensor& beta,
    double eps,
    int64_t M,
    int64_t N) {
  LayerNormKernel(kCPU, input, gamma, beta, M, N, eps, &out, &mean, &rstd);
  const auto input_shape = input.sizes();
  const size_t axis = input.dim() - normalized_shape.size();

  // Reshape the flat {M} statistics so they broadcast against the input:
  // keep the leading (outer) dims and use size 1 for the normalized dims.
  DimVector stat_shape;
  for (const auto idx : c10::irange(axis)) {
    stat_shape.emplace_back(input_shape[idx]);
  }
  for (const auto idx C10_UNUSED : c10::irange(axis, input.dim())) {
    stat_shape.emplace_back(1);
  }

  mean = mean.view(stat_shape);
  rstd = rstd.view(stat_shape);
}

void layer_norm_cpu_out(
    at::Tensor& out,
    const at::Tensor& input,
    const Tensor& gamma,
    const Tensor& beta,
    double eps,
    int64_t M,
    int64_t N) {
  if (M <= 0) {
    return;
  }
  LayerNormKernel(
      kCPU, input, gamma, beta, M, N, eps, &out, /*mean=*/nullptr, /*rstd=*/nullptr);
}

std::tuple<Tensor, Tensor, Tensor> layer_norm_cpu(
    const Tensor& input,
    IntArrayRef normalized_shape,
    const std::optional<Tensor>& weight_opt /* optional */,
    const std::optional<Tensor>& bias_opt /* optional */,
    double eps) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weight_maybe_owned =
      at::borrow_from_optional_tensor(weight_opt);
  const Tensor& weight = *weight_maybe_owned;
  c10::MaybeOwned<Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  bool mixed_type = is_mixed_type(input, weight, bias);
  if (mixed_type) {
    check_mixed_data_type(input, weight, bias);
  }

  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
  auto M = M_N.first;
  auto N = M_N.second;
  auto X = input.expect_contiguous();
  auto gamma = weight.expect_contiguous();
  auto beta = bias.expect_contiguous();

  Tensor Y = at::native::empty_like(
      *X,
      std::nullopt /* dtype */,
      std::nullopt /* layout */,
      std::nullopt /* device */,
      std::nullopt /* pin_memory */,
      at::MemoryFormat::Contiguous);
  const auto dtype = param_scalar_type(input, mixed_type);
  Tensor mean = at::empty({M}, X->options().dtype(dtype));
  Tensor rstd = at::empty({M}, X->options().dtype(dtype));

  layer_norm_with_mean_rstd_out(
      Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N);
  return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd));
}
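// [Illustrative sketch, not part of the upstream file] Minimal usage of
// layer_norm_cpu above. For an input of shape {2, 3, 4} normalized over the
// trailing shape {3, 4}, _check_layer_norm_inputs yields M = 2 (outer rows)
// and N = 12 (elements normalized per row); mean and rstd are computed per
// row and then viewed as {2, 1, 1} so they broadcast against the input. The
// guard macro and function name below are hypothetical.
#ifdef LAYER_NORM_CPU_USAGE_EXAMPLE
static void layer_norm_cpu_usage_example() {
  const auto input = at::randn({2, 3, 4});
  const auto weight = at::ones({3, 4});
  const auto bias = at::zeros({3, 4});
  // Returns (Y, mean, rstd); mean/rstd carry the per-row statistics.
  auto [Y, mean, rstd] =
      layer_norm_cpu(input, {3, 4}, weight, bias, /*eps=*/1e-5);
  TORCH_INTERNAL_ASSERT(Y.sizes() == input.sizes());
  TORCH_INTERNAL_ASSERT(mean.sizes() == (IntArrayRef{2, 1, 1}));
}
#endif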
std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cpu(
    const Tensor& dY,
    const Tensor& input,
    IntArrayRef normalized_shape,
    const Tensor& mean,
    const Tensor& rstd,
    const std::optional<Tensor>& weight_opt /* optional */,
    const std::optional<Tensor>& bias_opt /* optional */,
    std::array<bool, 3> grad_input_mask) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weight_maybe_owned =
      at::borrow_from_optional_tensor(weight_opt);
  const Tensor& weight = *weight_maybe_owned;
  c10::MaybeOwned<Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
  auto M = M_N.first;
  auto N = M_N.second;
  auto X = input.expect_contiguous();
  auto gamma = weight.expect_contiguous();
  auto beta = bias.expect_contiguous();

  Tensor dX;
  Tensor dgamma;
  Tensor dbeta;
  if (grad_input_mask[0]) {
    dX = at::native::empty_like(
        *X,
        std::nullopt /* dtype */,
        std::nullopt /* layout */,
        std::nullopt /* device */,
        std::nullopt /* pin_memory */,
        at::MemoryFormat::Contiguous);
  }
  if (grad_input_mask[1]) {
    dgamma = M > 0
        ? at::native::empty_like(
              *gamma,
              std::nullopt /* dtype */,
              std::nullopt /* layout */,
              std::nullopt /* device */,
              std::nullopt /* pin_memory */,
              at::MemoryFormat::Contiguous)
        : at::native::zeros_like(
              *gamma,
              std::nullopt /* dtype */,
              std::nullopt /* layout */,
              std::nullopt /* device */,
              std::nullopt /* pin_memory */,
              at::MemoryFormat::Contiguous);
  }
  if (grad_input_mask[2]) {
    dbeta = M > 0
        ? at::native::empty_like(
              *beta,
              std::nullopt /* dtype */,
              std::nullopt /* layout */,
              std::nullopt /* device */,
              std::nullopt /* pin_memory */,
              at::MemoryFormat::Contiguous)
        : at::native::zeros_like(
              *beta,
              std::nullopt /* dtype */,
              std::nullopt /* layout */,
              std::nullopt /* device */,
              std::nullopt /* pin_memory */,
              at::MemoryFormat::Contiguous);
  }
  if (M > 0) {
    LayerNormBackwardKernel(
        kCPU, dY, *X, mean, rstd, *gamma, M, N, &dX, &dgamma, &dbeta);
  }
  return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
}

Tensor layer_norm_symint(
    const Tensor& input,
    c10::SymIntArrayRef normalized_shape,
    const std::optional<Tensor>& weight_opt /* optional */,
    const std::optional<Tensor>& bias_opt /* optional */,
    double eps,
    bool /* cudnn_enable, deprecated */) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weight_maybe_owned =
      at::borrow_from_optional_tensor(weight_opt);
  const Tensor& weight = *weight_maybe_owned;
  c10::MaybeOwned<Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  return std::get<0>(
      at::native_layer_norm_symint(input, normalized_shape, weight, bias, eps));
}

DEFINE_DISPATCH(LayerNormKernel);
DEFINE_DISPATCH(LayerNormBackwardKernel);
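// [Illustrative sketch, not part of the upstream file] layer_norm_symint is
// the composite entry point behind at::layer_norm: it forwards to
// at::native_layer_norm_symint and keeps only the normalized output,
// discarding mean and rstd; the cudnn_enable flag is accepted only for
// backward compatibility. The guard macro and function name below are
// hypothetical.
#ifdef LAYER_NORM_SYMINT_USAGE_EXAMPLE
static void layer_norm_symint_usage_example() {
  const auto input = at::randn({8, 16});
  // Build the SymIntArrayRef from a plain vector of SymInts.
  std::vector<c10::SymInt> normalized_shape = {c10::SymInt(16)};
  // Equivalent to torch.nn.functional.layer_norm(input, (16,)) in Python.
  const auto out = layer_norm_symint(
      input,
      normalized_shape,
      /*weight_opt=*/std::nullopt,
      /*bias_opt=*/std::nullopt,
      /*eps=*/1e-5,
      /*cudnn_enable=*/false);
  TORCH_INTERNAL_ASSERT(out.sizes() == input.sizes());
}
#endif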
// Ported from pytorch/xla repo
std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
    const Tensor& input,
    IntArrayRef normalized_shape,
    const std::optional<Tensor>& weight_opt,
    const std::optional<Tensor>& bias_opt,
    double eps) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weight_maybe_owned =
      at::borrow_from_optional_tensor(weight_opt);
  const Tensor& weight = *weight_maybe_owned;
  c10::MaybeOwned<Tensor> bias_maybe_owned =
      at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
  auto M = M_N.first;
  auto X = input.expect_contiguous();
  auto gamma = weight.expect_contiguous();

  auto input_shape = input.sizes();
  const auto input_ndim = input.dim();
  const int normalized_ndim = normalized_shape.size();
  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int axis = input_ndim - normalized_ndim;

  // Properly handle zero-size inputs: the view(1, M, -1) call below breaks on
  // them.
  if (input.numel() == 0) {
    auto result_type = c10::promoteTypes(input.scalar_type(), kFloat);
    return std::make_tuple(
        at::empty_like(input),
        at::empty_like(input, c10::TensorOptions().dtype(result_type)),
        at::empty_like(input, c10::TensorOptions().dtype(result_type)));
  }

  at::Tensor input_reshaped = input.reshape({1, M, -1});
  // Unlike batch normalization, which applies a scalar scale and bias to each
  // entire channel/plane with the affine option, layer normalization applies
  // a per-element scale and bias. E.g. for an input of shape {N, C, H, W},
  // the batchnorm weight has shape {C}, while the layernorm weight has shape
  // {H, W} or {W}.
  auto outputs = at::native_batch_norm(
      input_reshaped,
      /*weight=*/{},
      /*bias=*/{},
      /*running_mean=*/{},
      /*running_var=*/{},
      /*training=*/true,
      /*momentum=*/0,
      eps);
  at::Tensor out = std::get<0>(outputs);
  out = out.view(input_shape);
  if (weight.defined() && bias.defined()) {
    out = bias.addcmul(out, weight, 1);
  } else if (weight.defined()) {
    out = out.mul(weight);
  } else if (bias.defined()) {
    out = out.add(bias);
  }

  at::Tensor mean = std::get<1>(outputs);
  at::Tensor rstd = std::get<2>(outputs);
  std::vector<int64_t> stat_shape;
  for (const auto idx : c10::irange(axis)) {
    stat_shape.push_back(input_shape[idx]);
  }
  for (const auto idx C10_UNUSED : c10::irange(axis, input.dim())) {
    stat_shape.push_back(1);
  }
  mean = mean.view(stat_shape);
  rstd = rstd.view(stat_shape);
  return std::make_tuple(out, mean, rstd);
}

Tensor rms_norm(
    const Tensor& input,
    IntArrayRef normalized_shape,
    const std::optional<Tensor>& weight_opt /* optional */,
    std::optional<double> eps) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> weight_maybe_owned =
      at::borrow_from_optional_tensor(weight_opt);
  const Tensor& weight = *weight_maybe_owned;
  auto bias_opt = std::optional<Tensor>();
  const Tensor& bias = *at::borrow_from_optional_tensor(bias_opt);
  (void)_check_layer_norm_inputs(input, normalized_shape, weight, bias);

  // Normalize over the trailing normalized_shape.size() dimensions.
  std::vector<int64_t> dims_to_reduce;
  for (const auto i : c10::irange(normalized_shape.size())) {
    dims_to_reduce.push_back(input.dim() - i - 1);
  }
  IntArrayRef dims_to_reduce_ref = IntArrayRef(dims_to_reduce);

  auto result = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      input.scalar_type(),
      "rms_norm",
      [&] {
        // Default eps to the machine epsilon of the (real) value type.
        scalar_t eps_val;
        if (!eps.has_value()) {
          eps_val =
              std::numeric_limits<at::scalar_value_type<scalar_t>::type>::epsilon();
        } else {
          eps_val = eps.value();
        }

        auto result = input.mul(at::rsqrt(
            at::pow(input, 2)
                .mean(dims_to_reduce_ref, /*keepdim=*/true)
                .add_(eps_val)));
        if (weight_opt.has_value()) {
          result = result.mul(weight_opt.value());
        }
        return result;
      });

  return result;
}

} // namespace at::native
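// [Illustrative sketch, not part of the upstream file] The composite rms_norm
// above computes y = x * rsqrt(mean(x^2, over the normalized dims) + eps),
// optionally scaled by weight. The check below compares that formula against
// the public at::rms_norm entry point, assuming a PyTorch build that exposes
// it; the guard macro and function name are hypothetical.
#ifdef RMS_NORM_USAGE_EXAMPLE
#include <ATen/ATen.h>
static void rms_norm_usage_example() {
  const auto x = at::randn({4, 16});
  const auto w = at::ones({16});
  // Manual reference: normalize over the last dim with eps = 1e-6.
  const auto ref =
      x.mul(at::rsqrt(
               x.pow(2).mean(at::IntArrayRef{-1}, /*keepdim=*/true).add(1e-6)))
          .mul(w);
  const auto out = at::rms_norm(x, {16}, w, /*eps=*/1e-6);
  TORCH_INTERNAL_ASSERT(at::allclose(out, ref));
}
#endif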