71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
72 #define INCLUDED_volk_32f_index_max_16u_a_H
81 #include <immintrin.h>
/* NOTE(review): partial listing -- the function signature, the declarations of
 * max, index, number, maxValuesBuffer and maxIndexesBuffer, and several closing
 * braces are not visible in this excerpt. Comments cover the visible lines only. */
/* Clamp the point count to USHRT_MAX; the winning index is cast to uint16_t at the end. */
87 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Number of complete 8-float groups processed by the vector loop. */
90 const uint32_t eighthPoints = num_points / 8;
92 float* inputPtr = (
float*)src0;
/* Per-iteration step added to every lane's index. */
94 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* Indexes start negative so the first add inside the loop produces 0..7.
 * NOTE(review): _mm256_set_ps lists the highest lane first, so lane 0 should
 * hold -8 (index 0 after the first add) -- confirm against the intrinsics guide. */
95 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Every lane starts from the current scalar max with index 0. */
99 __m256 maxValues = _mm256_set1_ps(max);
100 __m256 maxValuesIndex = _mm256_setzero_ps();
101 __m256 compareResults;
102 __m256 currentValues;
107 for(;number < eighthPoints; number++){
/* Aligned load of the next 8 floats, then advance the read pointer. */
109 currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
110 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than: a lane is replaced only by a larger value, so the
 * first occurrence of each lane's maximum keeps its index. */
112 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Where the compare mask is set, take the new index/value; otherwise keep the old one. */
114 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
115 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane maxima and their indexes for the scalar reduction below.
 * NOTE(review): aligned stores -- presumably both buffers are 32-byte aligned;
 * their declarations are not visible here, confirm. */
119 _mm256_store_ps(maxValuesBuffer, maxValues);
120 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 8 lanes to a single (max, index) pair. */
122 for(number = 0; number < 8; number++){
123 if(maxValuesBuffer[number] > max){
/* Lane indexes were stored as floats; this assignment converts back to index's type. */
124 index = maxIndexesBuffer[number];
125 max = maxValuesBuffer[number];
126 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
127 if (index > maxIndexesBuffer[number])
128 index = maxIndexesBuffer[number];
/* Scalar pass over the points left after the last full group of 8
 * (the body of the comparison is not visible in this excerpt). */
132 number = eighthPoints * 8;
133 for(;number < num_points; number++){
134 if(src0[number] > max){
/* Write the winning index, narrowed to 16 bits, to the output. */
139 target[0] = (uint16_t)index;
144 #ifdef LV_HAVE_SSE4_1
145 #include <smmintrin.h>
/* NOTE(review): partial listing -- the return type, the remaining parameters,
 * the declarations of max, index, number, maxValuesBuffer and maxIndexesBuffer,
 * and several closing braces are not visible in this excerpt. Comments cover
 * the visible lines only. */
148 volk_32f_index_max_16u_a_sse4_1(uint16_t* target,
const float* src0,
/* Clamp the point count to USHRT_MAX; the winning index is cast to uint16_t at the end. */
151 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Number of complete 4-float groups processed by the vector loop. */
154 const uint32_t quarterPoints = num_points / 4;
/* The cast drops the const qualifier of src0; the visible code only reads through inputPtr. */
156 float* inputPtr = (
float*)src0;
/* Per-iteration step added to every lane's index. */
158 __m128 indexIncrementValues = _mm_set1_ps(4);
/* Indexes start negative so the first add inside the loop produces 0..3.
 * NOTE(review): _mm_set_ps lists the highest lane first, so lane 0 should
 * hold -4 (index 0 after the first add) -- confirm against the intrinsics guide. */
159 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Every lane starts from the current scalar max with index 0. */
163 __m128 maxValues = _mm_set1_ps(max);
164 __m128 maxValuesIndex = _mm_setzero_ps();
165 __m128 compareResults;
166 __m128 currentValues;
171 for(;number < quarterPoints; number++){
/* Aligned load of the next 4 floats, then advance the read pointer. */
173 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
174 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than: a lane is replaced only by a larger value, so the
 * first occurrence of each lane's maximum keeps its index. */
176 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Where the compare mask is set, take the new index/value; otherwise keep the old one. */
178 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
179 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane maxima and their indexes for the scalar reduction below.
 * NOTE(review): aligned stores -- presumably both buffers are 16-byte aligned;
 * their declarations are not visible here, confirm. */
183 _mm_store_ps(maxValuesBuffer, maxValues);
184 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 4 lanes to a single (max, index) pair. */
186 for(number = 0; number < 4; number++){
187 if(maxValuesBuffer[number] > max){
/* Lane indexes were stored as floats; this assignment converts back to index's type. */
188 index = maxIndexesBuffer[number];
189 max = maxValuesBuffer[number];
190 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
191 if (index > maxIndexesBuffer[number])
192 index = maxIndexesBuffer[number];
/* Scalar pass over the points left after the last full group of 4
 * (the body of the comparison is not visible in this excerpt). */
196 number = quarterPoints * 4;
197 for(;number < num_points; number++){
198 if(src0[number] > max){
/* Write the winning index, narrowed to 16 bits, to the output. */
203 target[0] = (uint16_t)index;
211 #include <xmmintrin.h>
/* NOTE(review): partial listing -- the function signature, the declarations of
 * max, index, number, maxValuesBuffer and maxIndexesBuffer, and several closing
 * braces are not visible in this excerpt. Comments cover the visible lines only. */
/* Clamp the point count to USHRT_MAX; the winning index is cast to uint16_t at the end. */
217 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Number of complete 4-float groups processed by the vector loop. */
220 const uint32_t quarterPoints = num_points / 4;
222 float* inputPtr = (
float*)src0;
/* Per-iteration step added to every lane's index. */
224 __m128 indexIncrementValues = _mm_set1_ps(4);
/* Indexes start negative so the first add inside the loop produces 0..3.
 * NOTE(review): _mm_set_ps lists the highest lane first, so lane 0 should
 * hold -4 (index 0 after the first add) -- confirm against the intrinsics guide. */
225 __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
/* Every lane starts from the current scalar max with index 0. */
229 __m128 maxValues = _mm_set1_ps(max);
230 __m128 maxValuesIndex = _mm_setzero_ps();
231 __m128 compareResults;
232 __m128 currentValues;
237 for(;number < quarterPoints; number++){
/* Aligned load of the next 4 floats, then advance the read pointer. */
239 currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
240 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than: a lane is replaced only by a larger value, so the
 * first occurrence of each lane's maximum keeps its index. */
242 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
/* Bitwise per-lane select built from and/andnot/or:
 * (mask & new) | (~mask & old), applied to the indexes and then the values. */
244 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
245 _mm_andnot_ps(compareResults, maxValuesIndex));
246 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
247 _mm_andnot_ps(compareResults, maxValues));
/* Spill the per-lane maxima and their indexes for the scalar reduction below.
 * NOTE(review): aligned stores -- presumably both buffers are 16-byte aligned;
 * their declarations are not visible here, confirm. */
251 _mm_store_ps(maxValuesBuffer, maxValues);
252 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 4 lanes to a single (max, index) pair. */
254 for(number = 0; number < 4; number++){
255 if(maxValuesBuffer[number] > max){
/* Lane indexes were stored as floats; this assignment converts back to index's type. */
256 index = maxIndexesBuffer[number];
257 max = maxValuesBuffer[number];
258 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
259 if (index > maxIndexesBuffer[number])
260 index = maxIndexesBuffer[number];
/* Scalar pass over the points left after the last full group of 4
 * (the body of the comparison is not visible in this excerpt). */
264 number = quarterPoints * 4;
265 for(;number < num_points; number++){
266 if(src0[number] > max){
/* Write the winning index, narrowed to 16 bits, to the output. */
271 target[0] = (uint16_t)index;
277 #ifdef LV_HAVE_GENERIC
/* NOTE(review): partial listing -- the function signature, the declaration and
 * initial value of i, and the loop body are not visible in this excerpt. */
/* Clamp the point count to USHRT_MAX before iterating.
 * NOTE(review): presumably so the resulting index fits a 16-bit output, as in
 * the SIMD variants -- the store is not visible here, confirm. */
283 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Walk i upward until it reaches the clamped point count. */
290 for(;
i < num_points; ++
i) {
306 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
307 #define INCLUDED_volk_32f_index_max_16u_u_H
311 #include <inttypes.h>
316 #include <immintrin.h>
/* NOTE(review): partial listing -- the function signature, the declarations of
 * max, index, number, maxValuesBuffer and maxIndexesBuffer, and several closing
 * braces are not visible in this excerpt. Comments cover the visible lines only. */
/* Clamp the point count to USHRT_MAX; the winning index is cast to uint16_t at the end. */
322 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
/* Number of complete 8-float groups processed by the vector loop. */
325 const uint32_t eighthPoints = num_points / 8;
327 float* inputPtr = (
float*)src0;
/* Per-iteration step added to every lane's index. */
329 __m256 indexIncrementValues = _mm256_set1_ps(8);
/* Indexes start negative so the first add inside the loop produces 0..7.
 * NOTE(review): _mm256_set_ps lists the highest lane first, so lane 0 should
 * hold -8 (index 0 after the first add) -- confirm against the intrinsics guide. */
330 __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
/* Every lane starts from the current scalar max with index 0. */
334 __m256 maxValues = _mm256_set1_ps(max);
335 __m256 maxValuesIndex = _mm256_setzero_ps();
336 __m256 compareResults;
337 __m256 currentValues;
342 for(;number < eighthPoints; number++){
/* Unaligned load of the next 8 floats (no alignment requirement on the input),
 * then advance the read pointer. */
344 currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
345 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
/* Strict greater-than: a lane is replaced only by a larger value, so the
 * first occurrence of each lane's maximum keeps its index. */
347 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
/* Where the compare mask is set, take the new index/value; otherwise keep the old one. */
349 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
350 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
/* Spill the per-lane maxima and their indexes with unaligned stores for the
 * scalar reduction below. */
354 _mm256_storeu_ps(maxValuesBuffer, maxValues);
355 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
/* Reduce the 8 lanes to a single (max, index) pair. */
357 for(number = 0; number < 8; number++){
358 if(maxValuesBuffer[number] > max){
/* Lane indexes were stored as floats; this assignment converts back to index's type. */
359 index = maxIndexesBuffer[number];
360 max = maxValuesBuffer[number];
361 }
else if(maxValuesBuffer[number] == max){
/* Tie between lanes: keep the smaller index. */
362 if (index > maxIndexesBuffer[number])
363 index = maxIndexesBuffer[number];
/* Scalar pass over the points left after the last full group of 8
 * (the body of the comparison is not visible in this excerpt). */
367 number = eighthPoints * 8;
368 for(;number < num_points; number++){
369 if(src0[number] > max){
/* Write the winning index, narrowed to 16 bits, to the output. */
374 target[0] = (uint16_t)index;