#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
/*
 * Check-node (f-function) LLR update, min-sum approximation:
 * sign(la) * sign(lb) * min(|la|, |lb|).
 */
static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    // copysignf(1.0f, x) yields +/-1.0f, so the product carries the combined sign.
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}
/*
 * Apply the min-sum (odd / f-function) update for all stages from
 * depth-1 down to min_stage. LLRs for consecutive stages are stored
 * frame_size floats apart; each destination stage is computed from
 * adjacent pairs of the next (source) stage.
 */
static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage; // number of LLRs produced at this stage

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size; // next stage lies one frame further
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2; // consume one source pair per output LLR
        }
        --loop_stage;
        stage_size >>= 1; // each shallower stage is half the size
    }
}
/*
 * Variable-node (g-function) LLR update: add or subtract the upper LLR
 * depending on the previously decided partial-sum bit f.
 */
static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}
/*
 * Gather the bits at odd indices (1, 3, 5, ...) of u into u_even.
 * These are the "even" u-values in the polar-code butterfly recursion.
 */
static inline void even_u_values(unsigned char* u_even,
                                 const unsigned char* u,
                                 const int u_num)
{
    u++; // start at index 1
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}
/*
 * XOR adjacent pairs of u: u_xor[k] = u[2k] ^ u[2k+1].
 * Produces the "odd xor even" u-values for the butterfly recursion.
 */
static inline void odd_xor_even_values(unsigned char* u_xor,
                                       const unsigned char* u,
                                       const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}
/*
 * Number of butterfly stages that must be recomputed for this row:
 * count how many low-order stages place the row in their upper half.
 * Capped at frame_exp - 1 because the last stage holds the channel LLRs.
 */
static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) {
        if (!(row % stage_size < half_stage_size)) {
            break; // row falls in the lower half of this stage; stop here.
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}
#ifdef LV_HAVE_GENERIC
/*
 * Generic successive-cancellation butterfly for one decoder row.
 *
 * llrs      : LLR working buffer; stage s starts at llrs + s * frame_size.
 * u         : previously decided bits; scratch for partial sums above u + frame_size.
 * frame_exp : log2 of the frame size.
 * stage     : current recursion stage.
 * u_num     : number of bits decided so far.
 * row       : row (bit index) whose LLR is to be produced in llrs[row].
 *
 * Lower-half rows take the even (g-function) update directly; upper-half
 * rows first recurse into the next stage for both contributing rows and
 * then combine them with the odd (f-function) update.
 */
static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // LLRs of consecutive stages are stored frame_size floats apart.
    float* next_llrs = llrs + frame_size;
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        // Lower half: the partial-sum bit for this stage is already known.
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        // Upper half: recurse to refresh both source LLRs of the next stage,
        // feeding each recursion the matching partial-sum bits.
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}
#include <immintrin.h>
212 const int stage,
const int u_num,
const int row)
214 const int frame_size = 0x01 << frame_exp;
216 const float* next_llrs = llrs + frame_size + row;
217 *(llrs + row) =
llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
222 if(max_stage_depth < 3){
227 int loop_stage = max_stage_depth;
228 int stage_size = 0x01 << loop_stage;
233 __m256 src0, src1, dst;
238 unsigned char* u_target = u + frame_size;
239 unsigned char* u_temp = u + 2* frame_size;
240 memcpy(u_temp, u + u_num - stage_size,
sizeof(
unsigned char) * stage_size);
250 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
251 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
256 for(p = 0; p < stage_size; p += 8){
258 fbits = _mm_loadu_si128((__m128i*) u_target);
261 src0 = _mm256_loadu_ps(src_llr_ptr);
262 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
267 _mm256_storeu_ps(dst_llr_ptr, dst);
275 const int min_stage = stage > 2 ? stage : 2;
280 while(min_stage < loop_stage){
281 dst_llr_ptr = llrs + loop_stage * frame_size + row;
282 src_llr_ptr = dst_llr_ptr + frame_size;
283 for(el = 0; el < stage_size; el += 8){
284 src0 = _mm256_loadu_ps(src_llr_ptr);
286 src1 = _mm256_loadu_ps(src_llr_ptr);
291 _mm256_storeu_ps(dst_llr_ptr, dst);
#include <immintrin.h>
311 volk_32f_8u_polarbutterfly_32f_u_avx2(
float* llrs,
unsigned char* u,
313 const int stage,
const int u_num,
const int row)
315 const int frame_size = 0x01 << frame_exp;
317 const float* next_llrs = llrs + frame_size + row;
318 *(llrs + row) =
llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
323 if(max_stage_depth < 3){
328 int loop_stage = max_stage_depth;
329 int stage_size = 0x01 << loop_stage;
334 __m256 src0, src1, dst;
339 unsigned char* u_target = u + frame_size;
340 unsigned char* u_temp = u + 2* frame_size;
341 memcpy(u_temp, u + u_num - stage_size,
sizeof(
unsigned char) * stage_size);
351 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
352 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
357 for(p = 0; p < stage_size; p += 8){
359 fbits = _mm_loadu_si128((__m128i*) u_target);
362 src0 = _mm256_loadu_ps(src_llr_ptr);
363 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
368 _mm256_storeu_ps(dst_llr_ptr, dst);
376 const int min_stage = stage > 2 ? stage : 2;
381 while(min_stage < loop_stage){
382 dst_llr_ptr = llrs + loop_stage * frame_size + row;
383 src_llr_ptr = dst_llr_ptr + frame_size;
384 for(el = 0; el < stage_size; el += 8){
385 src0 = _mm256_loadu_ps(src_llr_ptr);
387 src1 = _mm256_loadu_ps(src_llr_ptr);
392 _mm256_storeu_ps(dst_llr_ptr, dst);