• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use core::arch::aarch64::*;
2 
3 union UnionCast {
4     // u32x4: [u32; 4],
5     f32x4: [f32; 4],
6     v: float32x4_t,
7 }
8 
9 #[inline]
f32x4_from_array(f32x4: [f32; 4]) -> float32x4_t10 pub const fn f32x4_from_array(f32x4: [f32; 4]) -> float32x4_t {
11     unsafe { UnionCast { f32x4 }.v }
12 }
13 
14 // #[inline]
15 // pub(crate) unsafe fn dot3_in_x(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t {
16 //     let x2_y2_z2_w2 = vmulq_f32(lhs, rhs);
17 //     let y2 = vdupq_laneq_f32(x2_y2_z2_w2, 1);
18 //     let z2 = vdupq_laneq_f32(x2_y2_z2_w2, 2);
19 //     let x2y2 = vaddq_f32(x2_y2_z2_w2, y2);
20 //     vaddq_f32(x2y2, z2)
21 // }
22 
23 #[inline]
dot3(lhs: float32x4_t, rhs: float32x4_t) -> f3224 pub(crate) unsafe fn dot3(lhs: float32x4_t, rhs: float32x4_t) -> f32 {
25     let x2_y2_z2_w2 = vmulq_f32(lhs, rhs);
26     let x2_y2_z2 = vsetq_lane_f32(0.0, x2_y2_z2_w2, 3);
27     vaddvq_f32(x2_y2_z2)
28     // let dot = dot3_in_x(lhs, rhs);
29     // vdups_laneq_f32(dot, 0)
30 }
31 
32 #[inline]
dot3_into_f32x4(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t33 pub(crate) unsafe fn dot3_into_f32x4(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t {
34     let dot = dot3(lhs, rhs);
35     vld1q_dup_f32(&dot as *const f32)
36     // let dot = dot3_in_x(lhs, rhs);
37     // vdupq_laneq_f32(dot, 0)
38 }
39 
40 #[inline]
dot4(lhs: float32x4_t, rhs: float32x4_t) -> f3241 pub(crate) unsafe fn dot4(lhs: float32x4_t, rhs: float32x4_t) -> f32 {
42     let x2_y2_z2_w2 = vmulq_f32(lhs, rhs);
43     // TODO: horizontal add - might perform bad?
44     vaddvq_f32(x2_y2_z2_w2)
45 }
46 
47 #[inline]
dot4_into_f32x4(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t48 pub(crate) unsafe fn dot4_into_f32x4(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t {
49     let dot = dot4(lhs, rhs);
50     vld1q_dup_f32(&dot as *const f32)
51 }
52