Lines Matching full:reg
32 result.buf.reg[i] = LoadInt32x4(src.data(row, col + i));
46 result.buf.reg[2 * i + 0] = LoadInt32x4(src.data(row + 0, col + i));
47 result.buf.reg[2 * i + 1] = LoadInt32x4(src.data(row + 4, col + i));
64 result.buf.reg[0] = LoadInt32x4(buf);
80 result.buf.reg[0] = LoadInt32x4(buf);
81 result.buf.reg[1] = LoadInt32x4(buf + 4);
92 result.buf.reg[0] = LoadInt32x4(src.data(pos));
103 result.buf.reg[0] = LoadInt32x4(src(0));
120 result.buf.reg[0] = LoadInt32x4(src.data(pos));
137 result.buf.reg[0] = LoadInt32x4(src.data(pos));
138 result.buf.reg[1] = LoadInt32x4(src.data(pos + 4));
154 result.buf.reg[0] = src(pos);
171 result.buf.reg[0] = LoadInt32x4(src.data(pos));
188 result.buf.reg[0] = LoadInt32x4(src.data(pos));
189 result.buf.reg[1] = LoadInt32x4(src.data(pos + 4));
200 result.buf.reg[0] = Add(lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
211 result.buf.reg[0] = Add(lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
222 result.buf.reg[0] = Add(lhs.buf.reg[0], rhs.buf.reg[0]);
233 result.buf.reg[0] = Add(lhs.buf.reg[0], rhs.buf.reg[0]);
244 result.buf.reg[0] = Add(lhs.buf.reg[0], DupLane<0>(rhs.buf.reg[0]));
245 result.buf.reg[1] = Add(lhs.buf.reg[1], DupLane<1>(rhs.buf.reg[0]));
246 result.buf.reg[2] = Add(lhs.buf.reg[2], DupLane<2>(rhs.buf.reg[0]));
247 result.buf.reg[3] = Add(lhs.buf.reg[3], DupLane<3>(rhs.buf.reg[0]));
258 result.buf.reg[0] = Add(lhs.buf.reg[0], rhs.buf.reg[0]);
259 result.buf.reg[1] = Add(lhs.buf.reg[1], rhs.buf.reg[0]);
260 result.buf.reg[2] = Add(lhs.buf.reg[2], rhs.buf.reg[0]);
261 result.buf.reg[3] = Add(lhs.buf.reg[3], rhs.buf.reg[0]);
272 const Int32x4 p = Dup<Int32x4>(rhs.buf.reg[0]);
274 result.buf.reg[i] = Add(lhs.buf.reg[i], p);
287 result.buf.reg[i] = Add(lhs.buf.reg[i], rhs.buf.reg[i]);
299 result.buf.reg[0] = Add(lhs.buf.reg[0], DupLane<0>(rhs.buf.reg[0]));
300 result.buf.reg[1] = Add(lhs.buf.reg[1], DupLane<0>(rhs.buf.reg[0]));
301 result.buf.reg[2] = Add(lhs.buf.reg[2], DupLane<1>(rhs.buf.reg[0]));
302 result.buf.reg[3] = Add(lhs.buf.reg[3], DupLane<1>(rhs.buf.reg[0]));
303 result.buf.reg[4] = Add(lhs.buf.reg[4], DupLane<2>(rhs.buf.reg[0]));
304 result.buf.reg[5] = Add(lhs.buf.reg[5], DupLane<2>(rhs.buf.reg[0]));
305 result.buf.reg[6] = Add(lhs.buf.reg[6], DupLane<3>(rhs.buf.reg[0]));
306 result.buf.reg[7] = Add(lhs.buf.reg[7], DupLane<3>(rhs.buf.reg[0]));
317 result.buf.reg[0] = Add(lhs.buf.reg[0], rhs.buf.reg[0]);
318 result.buf.reg[1] = Add(lhs.buf.reg[1], rhs.buf.reg[1]);
319 result.buf.reg[2] = Add(lhs.buf.reg[2], rhs.buf.reg[0]);
320 result.buf.reg[3] = Add(lhs.buf.reg[3], rhs.buf.reg[1]);
321 result.buf.reg[4] = Add(lhs.buf.reg[4], rhs.buf.reg[0]);
322 result.buf.reg[5] = Add(lhs.buf.reg[5], rhs.buf.reg[1]);
323 result.buf.reg[6] = Add(lhs.buf.reg[6], rhs.buf.reg[0]);
324 result.buf.reg[7] = Add(lhs.buf.reg[7], rhs.buf.reg[1]);
335 result.buf.reg[0] = Add(lhs.buf.reg[0], rhs.buf.reg[0]);
336 result.buf.reg[1] = Add(lhs.buf.reg[1], rhs.buf.reg[1]);
347 result.buf.reg[0] = Add(lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
348 result.buf.reg[1] = Add(lhs.buf.reg[1], Dup<Int32x4>(rhs.buf.reg[0]));
360 result.buf.reg[0] = SaturatingRoundingDoublingHighMul(
361 lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
373 result.buf.reg[0] = SaturatingRoundingDoublingHighMul(
374 lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
386 result.buf.reg[0] =
387 SaturatingRoundingDoublingHighMul(lhs.buf.reg[0], rhs.buf.reg[0]);
399 result.buf.reg[0] =
400 SaturatingRoundingDoublingHighMul(lhs.buf.reg[0], rhs.buf.reg[0]);
412 result.buf.reg[0] = SaturatingRoundingDoublingHighMul(
413 lhs.buf.reg[0], DupLane<0>(rhs.buf.reg[0]));
414 result.buf.reg[1] = SaturatingRoundingDoublingHighMul(
415 lhs.buf.reg[1], DupLane<1>(rhs.buf.reg[0]));
416 result.buf.reg[2] = SaturatingRoundingDoublingHighMul(
417 lhs.buf.reg[2], DupLane<2>(rhs.buf.reg[0]));
418 result.buf.reg[3] = SaturatingRoundingDoublingHighMul(
419 lhs.buf.reg[3], DupLane<3>(rhs.buf.reg[0]));
431 result.buf.reg[0] =
432 SaturatingRoundingDoublingHighMul(lhs.buf.reg[0], rhs.buf.reg[0]);
433 result.buf.reg[1] =
434 SaturatingRoundingDoublingHighMul(lhs.buf.reg[1], rhs.buf.reg[0]);
435 result.buf.reg[2] =
436 SaturatingRoundingDoublingHighMul(lhs.buf.reg[2], rhs.buf.reg[0]);
437 result.buf.reg[3] =
438 SaturatingRoundingDoublingHighMul(lhs.buf.reg[3], rhs.buf.reg[0]);
450 const Int32x4 p = Dup<Int32x4>(rhs.buf.reg[0]);
452 result.buf.reg[i] = SaturatingRoundingDoublingHighMul(lhs.buf.reg[i], p);
466 result.buf.reg[i] =
467 SaturatingRoundingDoublingHighMul(lhs.buf.reg[i], rhs.buf.reg[i]);
480 result.buf.reg[0] = SaturatingRoundingDoublingHighMul(
481 lhs.buf.reg[0], DupLane<0>(rhs.buf.reg[0]));
482 result.buf.reg[1] = SaturatingRoundingDoublingHighMul(
483 lhs.buf.reg[1], DupLane<0>(rhs.buf.reg[0]));
484 result.buf.reg[2] = SaturatingRoundingDoublingHighMul(
485 lhs.buf.reg[2], DupLane<1>(rhs.buf.reg[0]));
486 result.buf.reg[3] = SaturatingRoundingDoublingHighMul(
487 lhs.buf.reg[3], DupLane<1>(rhs.buf.reg[0]));
488 result.buf.reg[4] = SaturatingRoundingDoublingHighMul(
489 lhs.buf.reg[4], DupLane<2>(rhs.buf.reg[0]));
490 result.buf.reg[5] = SaturatingRoundingDoublingHighMul(
491 lhs.buf.reg[5], DupLane<2>(rhs.buf.reg[0]));
492 result.buf.reg[6] = SaturatingRoundingDoublingHighMul(
493 lhs.buf.reg[6], DupLane<3>(rhs.buf.reg[0]));
494 result.buf.reg[7] = SaturatingRoundingDoublingHighMul(
495 lhs.buf.reg[7], DupLane<3>(rhs.buf.reg[0]));
507 result.buf.reg[0] =
508 SaturatingRoundingDoublingHighMul(lhs.buf.reg[0], rhs.buf.reg[0]);
509 result.buf.reg[1] =
510 SaturatingRoundingDoublingHighMul(lhs.buf.reg[1], rhs.buf.reg[1]);
511 result.buf.reg[2] =
512 SaturatingRoundingDoublingHighMul(lhs.buf.reg[2], rhs.buf.reg[0]);
513 result.buf.reg[3] =
514 SaturatingRoundingDoublingHighMul(lhs.buf.reg[3], rhs.buf.reg[1]);
515 result.buf.reg[4] =
516 SaturatingRoundingDoublingHighMul(lhs.buf.reg[4], rhs.buf.reg[0]);
517 result.buf.reg[5] =
518 SaturatingRoundingDoublingHighMul(lhs.buf.reg[5], rhs.buf.reg[1]);
519 result.buf.reg[6] =
520 SaturatingRoundingDoublingHighMul(lhs.buf.reg[6], rhs.buf.reg[0]);
521 result.buf.reg[7] =
522 SaturatingRoundingDoublingHighMul(lhs.buf.reg[7], rhs.buf.reg[1]);
534 result.buf.reg[0] =
535 SaturatingRoundingDoublingHighMul(lhs.buf.reg[0], rhs.buf.reg[0]);
536 result.buf.reg[1] =
537 SaturatingRoundingDoublingHighMul(lhs.buf.reg[1], rhs.buf.reg[1]);
549 result.buf.reg[0] = SaturatingRoundingDoublingHighMul(
550 lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
551 result.buf.reg[1] = SaturatingRoundingDoublingHighMul(
552 lhs.buf.reg[1], Dup<Int32x4>(rhs.buf.reg[0]));
563 result.buf.reg[0] = Mul(lhs.buf.reg[0], Dup<Int32x4>(rhs.buf.reg[0]));
574 result.buf.reg[0] = Mul(lhs.buf.reg[0], rhs.buf.reg[0]);
585 result.buf.reg[0] = Mul(lhs.buf.reg[0], rhs.buf.reg[0]);
596 result.buf.reg[0] = Mul(lhs.buf.reg[0], rhs.buf.reg[0]);
607 const Int32x4 p = rhs.buf.reg[0];
608 result.buf.reg[0] = MulByRhsLane<0>(lhs.buf.reg[0], p);
609 result.buf.reg[1] = MulByRhsLane<1>(lhs.buf.reg[1], p);
610 result.buf.reg[2] = MulByRhsLane<2>(lhs.buf.reg[2], p);
611 result.buf.reg[3] = MulByRhsLane<3>(lhs.buf.reg[3], p);
622 const Int32x4 p = rhs.buf.reg[0];
623 result.buf.reg[0] = Mul(lhs.buf.reg[0], p);
624 result.buf.reg[1] = Mul(lhs.buf.reg[1], p);
625 result.buf.reg[2] = Mul(lhs.buf.reg[2], p);
626 result.buf.reg[3] = Mul(lhs.buf.reg[3], p);
637 const std::int32_t p = rhs.buf.reg[0];
639 result.buf.reg[i] = Mul(lhs.buf.reg[i], p);
652 result.buf.reg[i] = Mul(lhs.buf.reg[i], rhs.buf.reg[i]);
664 const Int32x4 p = rhs.buf.reg[0];
666 result.buf.reg[i + 0] = MulByRhsLane<0>(lhs.buf.reg[i + 0], p);
667 result.buf.reg[i + 2] = MulByRhsLane<1>(lhs.buf.reg[i + 2], p);
668 result.buf.reg[i + 4] = MulByRhsLane<2>(lhs.buf.reg[i + 4], p);
669 result.buf.reg[i + 6] = MulByRhsLane<3>(lhs.buf.reg[i + 6], p);
681 const Int32x4 p[2]{rhs.buf.reg[0], rhs.buf.reg[1]};
685 result.buf.reg[k] = Mul(lhs.buf.reg[k], p[j]);
698 const std::int32_t p = rhs.buf.reg[0];
700 MulAdd(lhs.buf.reg[i], p, &acc->buf.reg[i]);
712 const std::int32_t p = rhs.buf.reg[0];
715 const Int32x4 q = Mul(lhs.buf.reg[i], p);
717 acc->buf.reg[i + j * kRegsPerCol] =
718 Add(acc->buf.reg[i + j * kRegsPerCol], q);
730 const std::int32_t p = rhs.buf.reg[0];
732 MulAdd(lhs.buf.reg[i], p, &acc->buf.reg[i]);
744 const Int32x4 p = Dup<Int32x4>(Mul(lhs.buf.reg[0], rhs.buf.reg[0]));
746 acc->buf.reg[i] = Add(acc->buf.reg[i], p);
757 MulAdd(lhs.buf.reg[0], rhs.buf.reg[0], &acc->buf.reg[0]);
767 const Int32x4 p = rhs.buf.reg[0];
770 MulAddByRhsLane<0>(lhs.buf.reg[i], p, &acc->buf.reg[i + 0 * kRegsPerCol]);
771 MulAddByRhsLane<1>(lhs.buf.reg[i], p, &acc->buf.reg[i + 1 * kRegsPerCol]);
772 MulAddByRhsLane<2>(lhs.buf.reg[i], p, &acc->buf.reg[i + 2 * kRegsPerCol]);
773 MulAddByRhsLane<3>(lhs.buf.reg[i], p, &acc->buf.reg[i + 3 * kRegsPerCol]);
784 const Int32x4 p = Mul(lhs.buf.reg[0], rhs.buf.reg[0]);
793 acc->buf.reg[i + j * kRegsPerCol] =
794 Add(q[j], acc->buf.reg[i + j * kRegsPerCol]);
806 const Int32x4 p = Dup<Int32x4>(Mul(lhs.buf.reg[0], rhs.buf.reg[0]));
808 acc->buf.reg[i] = Add(acc->buf.reg[i], p);
819 const std::int32_t p = rhs.buf.reg[0];
820 MulAdd(lhs.buf.reg[0], p, &acc->buf.reg[0]);
830 const Int32x4 p = Mul(lhs.buf.reg[0], rhs.buf.reg[0]);
832 acc->buf.reg[i] = Add(p, acc->buf.reg[i]);
843 const std::int32_t p = rhs.buf.reg[0];
844 MulAdd(lhs.buf.reg[0], p, &acc->buf.reg[0]);