/* NOTE(review): garbled extraction — original source line numbers are fused into
 * the text and several statements are wrapped mid-token. Code left byte-identical;
 * only comments added. */
/* Fragment of a scalar complex dot product: *result = sum(input[i] * taps[i]).
 * Presumably VOLK's generic 32fc x2 dot-product kernel — TODO confirm against
 * the full source; the function's opening signature lines are elided. */
67 unsigned int num_points)
/* Reinterpret the complex arrays as interleaved (re, im) float pairs. */
70 float* res = (
float*)result;
71 float* in = (
float*)input;
72 float* tp = (
float*)taps;
/* Two complex points are consumed per loop iteration, into two partial sums
 * (breaks the accumulation dependency chain). */
73 unsigned int n_2_ccomplex_blocks = num_points / 2;
75 float sum0[2] = { 0, 0 };
76 float sum1[2] = { 0, 0 };
/* Complex multiply-accumulate: (a+bi)(c+di) = (ac - bd) + (ad + bc)i.
 * `i` is declared in an elided line (original 78 area). */
79 for (i = 0; i < n_2_ccomplex_blocks; ++i) {
80 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
81 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
82 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
83 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
/* (original lines 84-88 elided here: presumably the pointer advances
 * `in += 4; tp += 4;` and the loop's closing brace — not visible) */
/* Fold the two partial sums into the complex result. */
89 res[0] = sum0[0] + sum1[0];
90 res[1] = sum0[1] + sum1[1];
/* Tail: fold in the last point when num_points is odd. The guarding
 * `if (num_points & 1)` would sit in the elided lines 91-93 —
 * NOTE(review): confirm this add is conditional in the full source. */
94 *result += input[num_points - 1] * taps[num_points - 1];
/* Fragment of an SSE (__m128) complex dot-product kernel; the vector loop body
 * (original lines 126-153) is elided from this extraction. Code byte-identical. */
108 unsigned int num_points)
/* Zero the 2-float (re, im) accumulator; dotProduct's declaration is elided. */
112 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
114 unsigned int number = 0;
/* Two complex points per __m128 iteration. */
115 const unsigned int halfPoints = num_points / 2;
116 unsigned int isodd = num_points & 1;
118 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
/* Vector loop — entire body elided (original lines 126-153). */
125 for (; number < halfPoints; number++) {
/* Horizontal reduction of the two complex lanes; dotProductVector's
 * declaration/store are in the elided region. */
154 dotProduct += (dotProductVector[0] + dotProductVector[1]);
/* Odd-count tail. NOTE(review): the guard using `isodd` (lines 155-156) is
 * elided — confirm this add is conditional in the full source. */
157 dotProduct += input[num_points - 1] * taps[num_points - 1];
160 *result = dotProduct;
/* Fragment of an AVX (unaligned-load) complex dot-product kernel:
 * four complex floats per __m256 iteration. Code byte-identical. */
172 unsigned int num_points)
/* Misnomer: `isodd` holds num_points % 4 — the scalar-tail length,
 * not a boolean odd flag. */
175 unsigned int isodd = num_points & 3;
178 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
180 unsigned int number = 0;
181 const unsigned int quarterPoints = num_points / 4;
183 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
188 dotProdVal = _mm256_setzero_ps();
190 for (; number < quarterPoints; number++) {
/* Unaligned loads of interleaved (re, im) floats; a/b presumably walk
 * input/taps — their declarations are elided. */
191 x = _mm256_loadu_ps((
float*)a);
192 y = _mm256_loadu_ps((
float*)b);
/* yl = real parts duplicated into both lanes of each pair (even lanes),
 * yh = imaginary parts duplicated (odd lanes). */
194 yl = _mm256_moveldup_ps(y);
195 yh = _mm256_movehdup_ps(y);
/* tmp1 = (a.re*b.re, a.im*b.re, ...) per pair. */
197 tmp1 = _mm256_mul_ps(x, yl);
/* 0xB1 swaps re/im within each complex pair. */
199 x = _mm256_shuffle_ps(x, x, 0xB1);
/* tmp2 = (a.im*b.im, a.re*b.im, ...) per pair. */
201 tmp2 = _mm256_mul_ps(x, yh);
/* addsub(tmp1, tmp2) yields the complex products; second operand elided. */
203 z = _mm256_addsub_ps(tmp1,
206 dotProdVal = _mm256_add_ps(dotProdVal,
/* Spill the four partial complex sums; second operand elided. */
215 _mm256_storeu_ps((
float*)dotProductVector,
/* Horizontal reduction of the four complex lanes. */
218 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
219 dotProductVector[3]);
/* Scalar tail over the last num_points % 4 points. */
221 for (i = num_points - isodd; i < num_points; i++) {
222 dotProduct += input[i] * taps[i];
225 *result = dotProduct;
/* Stray declaration from another, almost entirely elided AVX kernel —
 * presumably an FMA variant of the same dot product; surrounding code
 * (original lines ~230-310) is not visible. */
247 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
/* Fragment of an aligned SSE (__m128) complex dot-product kernel; the vector
 * loop body (original lines 333-360) is elided. Code byte-identical. */
313 unsigned int num_points)
/* 8 bytes per complex point (2 * sizeof(float)). */
316 const unsigned int num_bytes = num_points * 8;
317 unsigned int isodd = num_points & 1;
320 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
322 unsigned int number = 0;
/* num_bytes >> 4 == num_points / 2: two complex points per __m128 iteration.
 * Equivalent to the other SSE path's `num_points / 2` — just written via bytes. */
323 const unsigned int halfPoints = num_bytes >> 4;
325 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
/* Vector loop — entire body elided. */
332 for (; number < halfPoints; number++) {
/* Horizontal reduction of the two complex lanes. */
361 dotProduct += (dotProductVector[0] + dotProductVector[1]);
/* Odd-count tail. NOTE(review): the `isodd` guard (lines 362-363) is elided —
 * confirm this add is conditional in the full source. */
364 dotProduct += input[num_points - 1] * taps[num_points - 1];
367 *result = dotProduct;
/* Fragment of a NEON complex dot-product kernel: four complex floats per
 * iteration via vld2q de-interleaved loads. Code byte-identical. */
379 unsigned int num_points)
382 unsigned int quarter_points = num_points / 4;
/* vld2q de-interleaves: val[0] = 4 real parts, val[1] = 4 imaginary parts. */
389 float32x4x2_t a_val, b_val, c_val, accumulator;
390 float32x4x2_t tmp_real, tmp_imag;
391 accumulator.val[0] = vdupq_n_f32(0);
392 accumulator.val[1] = vdupq_n_f32(0);
394 for (number = 0; number < quarter_points; ++number) {
395 a_val = vld2q_f32((
float*)a_ptr);
396 b_val = vld2q_f32((
float*)b_ptr);
/* Real part pieces: a.re*b.re and a.im*b.im. */
402 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
404 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
/* Imaginary part pieces: a.re*b.im and a.im*b.re. */
408 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
410 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
/* c = (ac - bd) + (ad + bc)i, accumulated per lane. */
412 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
413 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
415 accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
416 accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
/* (pointer advances and loop close elided, original lines 417-421) */
/* Re-interleave and reduce the four complex lanes. accum_result is presumably
 * a 4-element complex array — its declaration is elided; TODO confirm. */
422 vst2q_f32((
float*)accum_result, accumulator);
423 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
/* Scalar tail; a_ptr/b_ptr appear to be complex pointers, making this a
 * complex multiply via C99 _Complex — declarations elided, verify. */
426 for (number = quarter_points * 4; number < num_points; ++number) {
427 *result += (*a_ptr++) * (*b_ptr++);
/* Fragment of a NEON variant that fuses the multiplies into vmlaq/vmlsq,
 * halving the temporaries versus the plain NEON path. Code byte-identical. */
437 unsigned int num_points)
440 unsigned int quarter_points = num_points / 4;
447 float32x4x2_t a_val, b_val, accumulator;
448 float32x4x2_t tmp_imag;
449 accumulator.val[0] = vdupq_n_f32(0);
450 accumulator.val[1] = vdupq_n_f32(0);
452 for (number = 0; number < quarter_points; ++number) {
453 a_val = vld2q_f32((
float*)a_ptr);
454 b_val = vld2q_f32((
float*)b_ptr);
/* Despite the name, tmp_imag carries both halves:
 * val[1] builds the imaginary part: a.im*b.re, then += a.re*b.im;
 * val[0] builds the real part:      a.re*b.re, then -= a.im*b.im. */
459 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
460 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
463 tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
464 tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
466 accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
467 accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
/* (pointer advances and loop close elided, original lines 468-473) */
/* Re-interleave, reduce, then scalar tail — same pattern as the other
 * NEON kernels; accum_result's declaration is elided. */
474 vst2q_f32((
float*)accum_result, accumulator);
475 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
478 for (number = quarter_points * 4; number < num_points; ++number) {
479 *result += (*a_ptr++) * (*b_ptr++);
/* Fragment of a NEON variant that multiply-accumulates straight into two
 * accumulator pairs (no per-iteration temporaries), splitting the dependency
 * chain between accumulator1 (a.re terms) and accumulator2 (a.im terms).
 * Code byte-identical. */
488 unsigned int num_points)
491 unsigned int quarter_points = num_points / 4;
498 float32x4x2_t a_val, b_val, accumulator1, accumulator2;
499 accumulator1.val[0] = vdupq_n_f32(0);
500 accumulator1.val[1] = vdupq_n_f32(0);
501 accumulator2.val[0] = vdupq_n_f32(0);
502 accumulator2.val[1] = vdupq_n_f32(0);
504 for (number = 0; number < quarter_points; ++number) {
505 a_val = vld2q_f32((
float*)a_ptr);
506 b_val = vld2q_f32((
float*)b_ptr);
/* acc1 gathers a.re*b.re (real) and a.re*b.im (imag);
 * acc2 gathers -a.im*b.im (real) and a.im*b.re (imag). */
511 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
512 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
513 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
514 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
/* (pointer advances and loop close elided, original lines 515-518) */
/* Merge the two accumulators, re-interleave, reduce, then scalar tail. */
519 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
520 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
522 vst2q_f32((
float*)accum_result, accumulator1);
523 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
526 for (number = quarter_points * 4; number < num_points; ++number) {
527 *result += (*a_ptr++) * (*b_ptr++);
/* Fragment of an unrolled NEON variant: 8 complex floats per iteration via
 * vld4q (val[0]/val[2] = real lanes, val[1]/val[3] = imag lanes of two
 * 4-point groups). Code byte-identical.
 * NOTE(review): despite its name, `quarter_points` is num_points / 8 here —
 * each pass consumes 8 points (see the tail start at quarter_points * 8). */
536 unsigned int num_points)
541 unsigned int quarter_points = num_points / 8;
548 float32x4x4_t a_val, b_val, accumulator1, accumulator2;
549 float32x4x2_t reduced_accumulator;
550 accumulator1.val[0] = vdupq_n_f32(0);
551 accumulator1.val[1] = vdupq_n_f32(0);
552 accumulator1.val[2] = vdupq_n_f32(0);
553 accumulator1.val[3] = vdupq_n_f32(0);
554 accumulator2.val[0] = vdupq_n_f32(0);
555 accumulator2.val[1] = vdupq_n_f32(0);
556 accumulator2.val[2] = vdupq_n_f32(0);
557 accumulator2.val[3] = vdupq_n_f32(0);
560 for (number = 0; number < quarter_points; ++number) {
561 a_val = vld4q_f32((
float*)a_ptr);
562 b_val = vld4q_f32((
float*)b_ptr);
/* acc1: a.re*b.re (real) and a.re*b.im (imag) for both 4-point groups. */
567 accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
568 accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
570 accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
571 accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
/* acc2: -a.im*b.im (real) and a.im*b.re (imag) for both groups. */
573 accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
574 accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
576 accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
577 accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
/* (pointer advances and loop close elided, original lines 578-582) */
/* Tree-reduce the eight accumulators down to one complex pair. */
583 accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
584 accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
585 accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
586 accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
587 reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
588 reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
/* Re-interleave, reduce the four complex lanes, then scalar tail over the
 * last num_points % 8 points. */
591 vst2q_f32((
float*)accum_result, reduced_accumulator);
592 *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
595 for (number = quarter_points * 8; number < num_points; ++number) {
596 *result += (*a_ptr++) * (*b_ptr++);
/* Fragment of the aligned-load AVX complex dot-product kernel — identical
 * algorithm to the unaligned AVX path but using _mm256_load_ps/_mm256_store_ps,
 * which require 32-byte-aligned buffers. Code byte-identical. */
609 unsigned int num_points)
/* Misnomer: `isodd` holds num_points % 4 — the scalar-tail length. */
612 unsigned int isodd = num_points & 3;
615 memset(&dotProduct, 0x0, 2 *
sizeof(
float));
617 unsigned int number = 0;
618 const unsigned int quarterPoints = num_points / 4;
620 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
625 dotProdVal = _mm256_setzero_ps();
627 for (; number < quarterPoints; number++) {
/* Aligned loads of interleaved (re, im) floats. */
629 x = _mm256_load_ps((
float*)a);
630 y = _mm256_load_ps((
float*)b);
/* yl = duplicated real parts (even lanes), yh = duplicated imag parts. */
632 yl = _mm256_moveldup_ps(y);
633 yh = _mm256_movehdup_ps(y);
635 tmp1 = _mm256_mul_ps(x, yl);
/* 0xB1 swaps re/im within each complex pair. */
637 x = _mm256_shuffle_ps(x, x, 0xB1);
639 tmp2 = _mm256_mul_ps(x, yh);
/* addsub combines into the complex products; second operand elided. */
641 z = _mm256_addsub_ps(tmp1,
644 dotProdVal = _mm256_add_ps(dotProdVal,
/* Spill the four partial complex sums; second operand elided. */
653 _mm256_store_ps((
float*)dotProductVector,
/* Horizontal reduction of the four complex lanes. */
656 dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
657 dotProductVector[3]);
/* Scalar tail over the last num_points % 4 points. */
659 for (i = num_points - isodd; i < num_points; i++) {
660 dotProduct += input[i] * taps[i];
663 *result = dotProduct;
/* Stray declaration from yet another, almost entirely elided AVX kernel —
 * presumably an aligned FMA variant; surrounding code is not visible. */
685 __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;