avx2 for float casting and scaling

This commit is contained in:
BENEDEK László 2025-04-22 14:05:25 +02:00
parent d8164d120f
commit 125cd1e818
2 changed files with 104 additions and 2 deletions

View File

@ -1,6 +1,7 @@
file(GLOB_RECURSE SOURCES "./*.c")
add_executable(recorder ${SOURCES})
target_compile_options(recorder PRIVATE -Wall -Wextra)
target_compile_options(recorder PRIVATE -mavx2)
target_link_libraries(recorder ps2000)
set_target_properties(recorder PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"

View File

@ -19,19 +19,118 @@ void capture_stop(int signal) {
stop = true;
}
#ifdef __AVX2__
#include <immintrin.h>
__always_inline void to_float_iq(const int16_t* i_c, const int16_t* q_c, float* output, uint32_t len) {
size_t i = 0;
__m256 scale = _mm256_set1_ps(1.0f / INT16_MAX);
for (; i + 16 <= len; i += 16) {
// load
__m256i i_vec = _mm256_loadu_si256((__m256i*)&i_c[i]);
__m256i q_vec = _mm256_loadu_si256((__m256i*)&q_c[i]);
__m256i i_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(i_vec));
__m256i i_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(i_vec, 1));
__m256i q_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q_vec));
__m256i q_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q_vec, 1));
// convert to float
__m256 i_float_lo = _mm256_cvtepi32_ps(i_lo);
__m256 i_float_hi = _mm256_cvtepi32_ps(i_hi);
__m256 q_float_lo = _mm256_cvtepi32_ps(q_lo);
__m256 q_float_hi = _mm256_cvtepi32_ps(q_hi);
// scale
i_float_lo = _mm256_mul_ps(i_float_lo, scale);
i_float_hi = _mm256_mul_ps(i_float_hi, scale);
q_float_lo = _mm256_mul_ps(q_float_lo, scale);
q_float_hi = _mm256_mul_ps(q_float_hi, scale);
// interleave and store
__m256 interleaved_lo0 = _mm256_unpacklo_ps(i_float_lo, q_float_hi);
__m256 interleaved_lo1 = _mm256_unpackhi_ps(i_float_lo, q_float_lo);
__m256 interleaved_hi0 = _mm256_unpacklo_ps(i_float_hi, q_float_hi);
__m256 interleaved_hi1 = _mm256_unpackhi_ps(i_float_hi, q_float_hi);
_mm256_storeu_ps(&output[i * 2 + 0], _mm256_permute2f128_ps(interleaved_lo0, interleaved_lo1, 0x20));
_mm256_storeu_ps(&output[i * 2 + 8], _mm256_permute2f128_ps(interleaved_lo0, interleaved_lo1, 0x31));
_mm256_storeu_ps(&output[i * 2 + 16], _mm256_permute2f128_ps(interleaved_hi0, interleaved_hi1, 0x20));
_mm256_storeu_ps(&output[i * 2 + 24], _mm256_permute2f128_ps(interleaved_hi0, interleaved_hi1, 0x31));
// float i_output[16];
// float q_output[16];
// _mm256_storeu_ps(&i_output[i], i_float_lo);
// _mm256_storeu_ps(&i_output[i + 8], i_float_hi);
// _mm256_storeu_ps(&q_output[i], q_float_lo);
// _mm256_storeu_ps(&q_output[i + 8], q_float_hi);
// for (int j = 0; j < 16; j++) {
// output[(i + j) * 2 + 0] = i_output[j];
// output[(i + j) * 2 + 1] = q_output[j];
// }
}
for (; i < len; ++i) {
output[i * 2 + 0] = i_c[i] / (float)INT16_MAX;
output[i * 2 + 1] = q_c[i] / (float)INT16_MAX;
}
}
__always_inline void to_float(const int16_t* input, float* output, size_t len) {
size_t i = 0;
__m256 scale = _mm256_set1_ps(1.0f / INT16_MAX);
for (; i + 16 <= len; i += 16) {
// load
__m256i input_vec = _mm256_loadu_si256((__m256i*)&input[i]);
__m256i input_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(input_vec));
__m256i input_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(input_vec, 1));
// convert to float
__m256 float_lo = _mm256_cvtepi32_ps(input_lo);
__m256 float_hi = _mm256_cvtepi32_ps(input_hi);
// scale
float_lo = _mm256_mul_ps(float_lo, scale);
float_hi = _mm256_mul_ps(float_hi, scale);
// store
_mm256_storeu_ps(&output[i], float_lo);
_mm256_storeu_ps(&output[i + 8], float_hi);
}
for (; i < len; i++) {
output[i] = input[i] / (float)INT16_MAX;
}
}
#endif
void get_overview_buffers(int16_t** overviewBuffers, int16_t overflow, uint32_t triggeredAt, int16_t triggered, int16_t auto_stop, uint32_t nValues) {
(void)overflow;
(void)triggeredAt;
(void)triggered;
(void)auto_stop;
#ifdef __AVX2__
if (cfg->iq_mode) {
to_float_iq(overviewBuffers[0], overviewBuffers[2], buffer, nValues);
fwrite(buffer, sizeof(float), nValues * 2, stdout);
} else {
to_float(overviewBuffers[0], buffer, nValues);
fwrite(buffer, sizeof(float), nValues, stdout);
}
#else
if (cfg->iq_mode) {
for (uint32_t i = 0; i < nValues; i++) {
buffer[(i * 2) + 0] = overviewBuffers[0][i] / (float)INT16_MAX;
buffer[(i * 2) + 1] = overviewBuffers[2][i] / (float)INT16_MAX;
}
// fprintf(stderr, "%u\n", nValues);
fwrite(buffer, sizeof(float), nValues * 2, stdout);
} else {
for (uint32_t i = 0; i < nValues; i++) {
@ -40,6 +139,7 @@ void get_overview_buffers(int16_t** overviewBuffers, int16_t overflow, uint32_t
fwrite(buffer, sizeof(float), nValues, stdout);
}
#endif
}
void cleanup(int16_t unit) {
@ -93,7 +193,8 @@ int main(int argc, char** argv) {
buffer = (float*)malloc(sizeof(float) * (cfg->max_buffersize) * (cfg->iq_mode ? 2 : 1));
// start data capture without aggregation
fprintf(stderr, "sample_interval=%u, time_units=%u, buffer_size=%u, max_buffer_size=%u\n", cfg->sample_interval, cfg->time_units, cfg->buffersize, cfg->max_buffersize);
fprintf(stderr, "sample_interval=%u, time_units=%u, buffer_size=%u, max_buffer_size=%u\n", cfg->sample_interval, cfg->time_units, cfg->buffersize,
cfg->max_buffersize);
if (ps2000_run_streaming_ns(unit, cfg->sample_interval, cfg->time_units, cfg->buffersize, false, 1, cfg->max_buffersize) == 0) {
fprintf(stderr, "run_streaming_ns failed!\n");
cleanup(unit);