#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
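/* Add-compare-select (Viterbi) butterflies for a K=7, rate-1/2 convolutional
 * code: 64 trellis states with 8-bit path metrics. Each variant advances the
 * trellis over framebits + excess received symbol pairs, writing the updated
 * metrics and one 64-bit survivor-decision word per step into dec. */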
/* One trellis step's worth of survivor decisions: 64 decision bits,
 * addressable as bytes, 16-bit words, or 32-bit words. */
typedef union {
    unsigned char t[64 / 8];
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t;
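/* Keep the 8-bit path metrics from overflowing: subtract the smallest
 * metric from every state. */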
static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (X[i] < min)
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}
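/* One add-compare-select butterfly. Predecessor states i and i + 32 both
 * feed successor states 2i and 2i + 1; the branch metric is the distance
 * between the received symbol pair and this branch's expected pair in
 * Branchtab, and the losing-path decision bits are recorded in d. */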
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metricsum = 1;
    for (j = 0; j < RATE; j++)
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;
    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;
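    /* Pack this butterfly's two decision bits into the 32-bit word of the
     * decision record that belongs to trellis step s. */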
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}

#if LV_HAVE_AVX2
#include <immintrin.h>
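/* AVX2 kernel: the 64 path metrics live in two 256-bit vectors, accessed as
 * ((__m256i*)X)[0..1] and ((__m256i*)Y)[0..1]. Each pass of the main loop
 * consumes four symbol bytes and advances two trellis steps: the first
 * half-step reads metrics from X and writes Y, the second reads Y and
 * writes X, so the metrics end up back in X without any copying. */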
static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        int s20, s21;
        unsigned char *a80, *b6;
        int *a110, *a91, *a93;
        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
        __m256i a86, a87;
        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
            s18, s19, s22, s23, s24, s25, t13, t14, t15;
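        /* First half-step: branch metrics. XOR the two received symbol
         * bytes with the two 32-byte halves of Branchtab, average the
         * rate-1/2 pair (avg_epu8 rounds up, matching the scalar
         * metricsum = 1), and scale the result into 0..63. */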
        a71 = ((__m256i*)X);
        s18 = *(a71);
        a72 = (a71 + 1);
        s19 = *(a72);
        a73 = (4 * i9);
        b6 = (syms + a73);
        a75 = *(b6);
        a76 = _mm256_set1_epi8(a75);
        a77 = ((__m256i*)Branchtab);
        a78 = *(a77);
        a79 = _mm256_xor_si256(a76, a78);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm256_set1_epi8(a81);
        a83 = (a77 + 1);
        a84 = *(a83);
        a85 = _mm256_xor_si256(a82, a84);
        t13 = _mm256_avg_epu8(a79, a85);
        a86 = ((__m256i)t13);
        a87 = _mm256_srli_epi16(a86, 2);
        a88 = ((__m256i)a87);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        /* Candidate path metrics for the four add legs of the butterflies. */
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
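        /* Compare-select: the smaller candidate survives for each successor
         * state. cmpeq flags the lanes where the second path won, and
         * movemask compresses those flags into 32-bit decision words. */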
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        /* Step 2*i9 owns two 32-bit decision words in dec (see decision_t). */
        a91 = ((int*)dec);
        a92 = (4 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        *(a93 + 1) = s21;
        /* Interleave the surviving metrics back into state order and store
         * the new metrics into Y. */
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        a95 = ((__m256i*)Y);
        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
        *(a95) = s24;
        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
        *(a95 + 1) = s23;
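        /* Renormalize Y: fold the two 32-byte vectors down with repeated
         * unsigned minima until lane 0 holds the global minimum metric,
         * broadcast it to all lanes, and subtract it from every state. */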
        __m256i m5, m6, m7;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5));
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                       ((__m256i)m7)));
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(m6, m6, 0);
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
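        /* Second half-step: the same butterfly on symbol bytes 2 and 3,
         * reading the metrics just written to Y and storing the results
         * (and the decision words for step 2*i9 + 1) back into X. */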
        unsigned char a188, a194;
        int s48, s54;
        unsigned char *a187, *a193;
        int *a204, *a206, *a223, *b16;
        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
        __m256i a199, a200;
        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
            m41, m42, s46, s47, s50, s51, t25, t26, t27;
        a184 = ((__m256i*)Y);
        s46 = *(a184);
        a185 = (a184 + 1);
        s47 = *(a185);
        a187 = (b6 + 2);
        a188 = *(a187);
        a189 = _mm256_set1_epi8(a188);
        a190 = ((__m256i*)Branchtab);
        a191 = *(a190);
        a192 = _mm256_xor_si256(a189, a191);
        a193 = (b6 + 3);
        a194 = *(a193);
        a195 = _mm256_set1_epi8(a194);
        a196 = (a190 + 1);
        a197 = *(a196);
        a198 = _mm256_xor_si256(a195, a197);
        t25 = _mm256_avg_epu8(a192, a198);
        a199 = ((__m256i)t25);
        a200 = _mm256_srli_epi16(a199, 2);
        a201 = ((__m256i)a200);
        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
        m39 = _mm256_adds_epu8(s46, t26);
        m40 = _mm256_adds_epu8(s47, t27);
        m41 = _mm256_adds_epu8(s46, t27);
        m42 = _mm256_adds_epu8(s47, t26);
        a202 = _mm256_min_epu8(m40, m39);
        d17 = _mm256_cmpeq_epi8(a202, m40);
        a203 = _mm256_min_epu8(m42, m41);
        d18 = _mm256_cmpeq_epi8(a203, m42);
        s24 = _mm256_unpacklo_epi8(d17, d18);
        s25 = _mm256_unpackhi_epi8(d17, d18);
        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
        /* Decision words for step 2*i9 + 1 follow the first half-step's. */
        a204 = ((int*)dec);
        a206 = (a204 + (4 * i9));
        *(a206 + 2) = s48;
        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
        *(a206 + 3) = s54;
        s50 = _mm256_unpacklo_epi8(a202, a203);
        s51 = _mm256_unpackhi_epi8(a202, a203);
        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
        a208 = ((__m256i*)X);
        *(a208) = s25;
        *(a208 + 1) = s51;
        __m256i m12, m13, m14;
        m12 = ((__m256i*)X)[0];
        m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
        m12 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m12, m12, 0x21), m12));
        m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
                                        ((__m256i)m14)));
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
                                        ((__m256i)m14)));
        m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
                                        ((__m256i)m14)));
        m14 = _mm256_unpacklo_epi8(m14, m14);
        m14 = _mm256_shufflelo_epi16(m14, 0);
        m13 = _mm256_unpacklo_epi64(m14, m14);
        m13 = _mm256_permute2x128_si256(m13, m13, 0);
        ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
        ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
    }
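    /* If framebits + excess is odd, one trellis step remains; finish it with
     * the scalar butterfly and renormalize, as in the generic kernel. */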
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }
        renormalize(Y);
    }
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <pmmintrin.h>
#include <xmmintrin.h>
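/* 128-bit Spiral-style variant: the same two-steps-per-pass dataflow as the
 * AVX2 kernel, with the 64 metrics split across four __m128i vectors, so
 * every half-step runs two groups of sixteen butterflies. */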
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        a91 = ((short int*)dec);
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
        a204 = ((short int*)dec);
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }
        renormalize(Y);
    }
}
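/* A second 128-bit variant whose body is structurally identical to the
 * kernel above. */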
                                                   unsigned int framebits,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        a91 = ((short int*)dec);
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
        a204 = ((short int*)dec);
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }
        renormalize(Y);
    }
}

#endif /* LV_HAVE_SSE3 */
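/* Generic reference implementation: one scalar butterfly per state pair for
 * each of the nbits trellis steps, then renormalize and swap the old/new
 * metric buffers. */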
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int s, i;

    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        /* Swap pointers to the old and new metric buffers. */
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* INCLUDED_volk_8u_x4_conv_k7_r2_8u_H */