66 unsigned int num_points)
/* Input size in bytes, computed as two bytes per point. */
68 const unsigned int num_bytes = num_points * 2;
70 __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
71 __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
/* Number of complete 16-byte chunks in num_bytes. */
86 int bound = (num_bytes >> 4);
/* num_bytes >> 1 converts back to two-byte elements; & 7 keeps the count
 * modulo 8, i.e. the elements left over after the 8-element chunks. */
87 int leftovers = (num_bytes >> 1) & 7;
/* Chunk loop: one iteration per 16-byte chunk (loop body is not visible in
 * this excerpt). */
89 for (; i < bound; ++i) {
/* Scalar tail: starts at element index bound * 8 and processes `leftovers`
 * elements one at a time. */
120 for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
/* Each output is src0 plus one of src1..src4 at the same index. */
121 target0[i] = src0[i] + src1[i];
122 target1[i] = src0[i] + src2[i];
123 target2[i] = src0[i] + src3[i];
124 target3[i] = src0[i] + src4[i];
141 unsigned int num_points)
/* Number of complete groups of eight points. */
143 const unsigned int eighth_points = num_points / 8;
144 unsigned int number = 0;
/* 128-bit NEON registers, each holding eight signed 16-bit lanes. */
146 int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
147 int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
/* Vector loop: one iteration per group of eight points. */
148 for (number = 0; number < eighth_points; ++number) {
/* Load eight int16 values from each of the five inputs. */
149 src0_vec = vld1q_s16(src0);
150 src1_vec = vld1q_s16(src1);
151 src2_vec = vld1q_s16(src2);
152 src3_vec = vld1q_s16(src3);
153 src4_vec = vld1q_s16(src4);
/* Lane-wise add of src0 to each of src1..src4. */
155 target0_vec = vaddq_s16(src0_vec, src1_vec);
156 target1_vec = vaddq_s16(src0_vec, src2_vec);
157 target2_vec = vaddq_s16(src0_vec, src3_vec);
158 target3_vec = vaddq_s16(src0_vec, src4_vec);
/* Store the four result vectors to their respective outputs. */
160 vst1q_s16(target0, target0_vec);
161 vst1q_s16(target1, target1_vec);
162 vst1q_s16(target2, target2_vec);
163 vst1q_s16(target3, target3_vec);
/* NOTE(review): the pointer advancement for the vector loop is not visible
 * in this excerpt; the tail loop below relies on the pointers already
 * pointing at element eighth_points * 8 -- confirm. */
/* Scalar tail: handles the remaining num_points % 8 elements. */
175 for (number = eighth_points * 8; number < num_points; ++number) {
/* src0 is read four times per element and only advanced on the last use;
 * every other pointer is post-incremented where it is used. */
176 *target0++ = *src0 + *src1++;
177 *target1++ = *src0 + *src2++;
178 *target2++ = *src0 + *src3++;
179 *target3++ = *src0++ + *src4++;
196 unsigned int num_points)
/* Input size in bytes, computed as two bytes per point. */
198 const unsigned int num_bytes = num_points * 2;
/* Element count recovered from the byte count; equal to num_points as long
 * as the multiplication above did not wrap.
 * NOTE(review): with a 32-bit unsigned int, num_points * 2 wraps once
 * num_points reaches 2^31, which would make bound smaller than num_points
 * -- confirm whether callers can pass sizes that large. */
202 int bound = num_bytes >> 1;
/* Scalar loop over every element: each output is src0 plus one of
 * src1..src4 at the same index. */
204 for (i = 0; i < bound; ++i) {
205 target0[i] = src0[i] + src1[i];
206 target1[i] = src0[i] + src2[i];
207 target2[i] = src0[i] + src3[i];
208 target3[i] = src0[i] + src4[i];