/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  icv_variance_sse42.c
 *
 * @brief
 *  This file contains the functions to compute variance
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  icv_variance_8x4_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <immintrin.h>

/* User include files */
#include "icv_datatypes.h"
#include "icv_macros.h"
#include "icv_platform_macros.h"
#include "icv.h"

/**
 *******************************************************************************
 *
 * @brief
 *  Computes variance of a given 8x4 block
 *
 * @par Description
 *  Computes the variance of a given 8x4 block of 8-bit pixels
 *
 * @param[in] pu1_src
 *  Source buffer
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] wd
 *  Width; assumed to be 8
 *
 * @param[in] ht
 *  Height; assumed to be 4
 *
 * @returns
 *  Variance of the block
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
{
    WORD32 sum;
    WORD32 sum_sqr;
    WORD32 blk_sz;
    WORD32 vrnc;
    __m128 src_r0, src_r1;
    __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
    __m128i sum_r0, sum_r1;
    __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
    __m128i vsum, vsum_sqr;
    __m128i zero;
    UNUSED(wd);
    UNUSED(ht);

    ASSERT(wd == 8);
    ASSERT(ht == 4);

    sum = 0;
    sum_sqr = 0;

    blk_sz = 8 * 4;

    zero = _mm_setzero_si128();

    /* Load source */
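    /* Rows 0 and 2 are packed into the low and high 64-bit halves of      */
    /* src_r0, rows 1 and 3 into src_r1, so the four 8-pixel rows occupy   */
    /* two 128-bit registers                                               */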
    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r0 = _mm_loadh_pi(src_r0, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = _mm_loadh_pi(src_r1, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    /* Compute sum of all elements */
    /* Use SAD with 0, since there is no pairwise addition */
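    /* _mm_sad_epu8 against zero yields, in each 64-bit half, the sum of   */
    /* the eight source bytes in that half                                 */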
    sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero);
    sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero);

    /* Accumulate SAD */
    vsum = _mm_add_epi64(sum_r0, sum_r1);
    vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));

    sum = _mm_cvtsi128_si32(vsum);

    /* Unpack to 16 bits */
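    /* Interleaving with zero zero-extends each byte to a 16-bit lane so   */
    /* that the squares below can be formed without overflow               */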
    ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
    ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
    ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
    ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);

    /* Compute sum of squares */
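    /* _mm_madd_epi16 squares each 16-bit pixel and adds adjacent pairs,   */
    /* producing four 32-bit partial sums of squares per register          */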
    sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0);
    sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1);
    sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2);
    sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3);

    vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1);
    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);

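    /* Horizontal reduction: fold the four 32-bit lanes into lane 0 */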
    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
    sum_sqr = _mm_cvtsi128_si32(vsum_sqr);

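    /* Integer form of var = E[x^2] - (E[x])^2:                            */
    /* ((N * sum(x^2)) - (sum(x))^2) / N^2, with N = blk_sz = 32           */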
    /* Compute variance */
    vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);

    return vrnc;
}
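
/**
 *******************************************************************************
 *
 * @brief
 *  Plain-C reference sketch of the same 8x4 variance computation
 *
 * @par Description
 *  Not part of the original implementation: a minimal scalar equivalent of
 *  icv_variance_8x4_ssse3() that can be used to sanity-check the SIMD path.
 *  The guard macro ICV_VARIANCE_REFERENCE is hypothetical, so this sketch is
 *  not compiled by default.
 *
 *******************************************************************************
 */
#ifdef ICV_VARIANCE_REFERENCE
static WORD32 icv_variance_8x4_reference(UWORD8 *pu1_src, WORD32 src_strd)
{
    WORD32 i, j;
    WORD32 sum = 0;
    WORD32 sum_sqr = 0;
    WORD32 blk_sz = 8 * 4;

    /* Accumulate sum and sum of squares over the 8x4 block */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 8; j++)
        {
            WORD32 pel = pu1_src[j];

            sum += pel;
            sum_sqr += pel * pel;
        }
        pu1_src += src_strd;
    }

    /* Same integer variance formula as the SIMD path above */
    return ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);
}
#endif /* ICV_VARIANCE_REFERENCE */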