From 125cd1e8181a03bf07e07554560fb29d4bb1114a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?BENEDEK=20L=C3=A1szl=C3=B3?= Date: Tue, 22 Apr 2025 14:05:25 +0200 Subject: [PATCH] avx2 for float casting and scaling --- src/recorder/CMakeLists.txt | 1 + src/recorder/main.c | 105 +++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/src/recorder/CMakeLists.txt b/src/recorder/CMakeLists.txt index 4a921f7..9546f97 100644 --- a/src/recorder/CMakeLists.txt +++ b/src/recorder/CMakeLists.txt @@ -1,6 +1,7 @@ file(GLOB_RECURSE SOURCES "./*.c") add_executable(recorder ${SOURCES}) target_compile_options(recorder PRIVATE -Wall -Wextra) +target_compile_options(recorder PRIVATE -mavx2) target_link_libraries(recorder ps2000) set_target_properties(recorder PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" diff --git a/src/recorder/main.c b/src/recorder/main.c index f33e044..a4c5395 100644 --- a/src/recorder/main.c +++ b/src/recorder/main.c @@ -19,19 +19,118 @@ void capture_stop(int signal) { stop = true; } +#ifdef __AVX2__ +#include +__always_inline void to_float_iq(const int16_t* i_c, const int16_t* q_c, float* output, uint32_t len) { + size_t i = 0; + + __m256 scale = _mm256_set1_ps(1.0f / INT16_MAX); + for (; i + 16 <= len; i += 16) { + // load + __m256i i_vec = _mm256_loadu_si256((__m256i*)&i_c[i]); + __m256i q_vec = _mm256_loadu_si256((__m256i*)&q_c[i]); + + __m256i i_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(i_vec)); + __m256i i_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(i_vec, 1)); + __m256i q_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(q_vec)); + __m256i q_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(q_vec, 1)); + + // convert to float + __m256 i_float_lo = _mm256_cvtepi32_ps(i_lo); + __m256 i_float_hi = _mm256_cvtepi32_ps(i_hi); + __m256 q_float_lo = _mm256_cvtepi32_ps(q_lo); + __m256 q_float_hi = _mm256_cvtepi32_ps(q_hi); + + // scale + i_float_lo = _mm256_mul_ps(i_float_lo, scale); + i_float_hi = _mm256_mul_ps(i_float_hi, scale); + q_float_lo = _mm256_mul_ps(q_float_lo, scale); + q_float_hi = _mm256_mul_ps(q_float_hi, scale); + + // interleave and store + __m256 interleaved_lo0 = _mm256_unpacklo_ps(i_float_lo, q_float_hi); + __m256 interleaved_lo1 = _mm256_unpackhi_ps(i_float_lo, q_float_lo); + + __m256 interleaved_hi0 = _mm256_unpacklo_ps(i_float_hi, q_float_hi); + __m256 interleaved_hi1 = _mm256_unpackhi_ps(i_float_hi, q_float_hi); + + _mm256_storeu_ps(&output[i * 2 + 0], _mm256_permute2f128_ps(interleaved_lo0, interleaved_lo1, 0x20)); + _mm256_storeu_ps(&output[i * 2 + 8], _mm256_permute2f128_ps(interleaved_lo0, interleaved_lo1, 0x31)); + _mm256_storeu_ps(&output[i * 2 + 16], _mm256_permute2f128_ps(interleaved_hi0, interleaved_hi1, 0x20)); + _mm256_storeu_ps(&output[i * 2 + 24], _mm256_permute2f128_ps(interleaved_hi0, interleaved_hi1, 0x31)); + + // float i_output[16]; + // float q_output[16]; + + // _mm256_storeu_ps(&i_output[i], i_float_lo); + // _mm256_storeu_ps(&i_output[i + 8], i_float_hi); + // _mm256_storeu_ps(&q_output[i], q_float_lo); + // _mm256_storeu_ps(&q_output[i + 8], q_float_hi); + + // for (int j = 0; j < 16; j++) { + // output[(i + j) * 2 + 0] = i_output[j]; + // output[(i + j) * 2 + 1] = q_output[j]; + // } + } + + for (; i < len; ++i) { + output[i * 2 + 0] = i_c[i] / (float)INT16_MAX; + output[i * 2 + 1] = q_c[i] / (float)INT16_MAX; + } +} + +__always_inline void to_float(const int16_t* input, float* output, size_t len) { + size_t i = 0; + + __m256 scale = _mm256_set1_ps(1.0f / INT16_MAX); + for (; i + 16 <= len; i += 16) { + // load + __m256i input_vec = _mm256_loadu_si256((__m256i*)&input[i]); + + __m256i input_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(input_vec)); + __m256i input_hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(input_vec, 1)); + + // convert to float + __m256 float_lo = _mm256_cvtepi32_ps(input_lo); + __m256 float_hi = _mm256_cvtepi32_ps(input_hi); + + // scale + float_lo = _mm256_mul_ps(float_lo, scale); + float_hi = _mm256_mul_ps(float_hi, scale); + + // store + _mm256_storeu_ps(&output[i], float_lo); + _mm256_storeu_ps(&output[i + 8], float_hi); + } + + for (; i < len; i++) { + output[i] = input[i] / (float)INT16_MAX; + } +} +#endif + void get_overview_buffers(int16_t** overviewBuffers, int16_t overflow, uint32_t triggeredAt, int16_t triggered, int16_t auto_stop, uint32_t nValues) { (void)overflow; (void)triggeredAt; (void)triggered; (void)auto_stop; +#ifdef __AVX2__ + if (cfg->iq_mode) { + to_float_iq(overviewBuffers[0], overviewBuffers[2], buffer, nValues); + fwrite(buffer, sizeof(float), nValues * 2, stdout); + } else { + to_float(overviewBuffers[0], buffer, nValues); + fwrite(buffer, sizeof(float), nValues, stdout); + } + +#else if (cfg->iq_mode) { for (uint32_t i = 0; i < nValues; i++) { buffer[(i * 2) + 0] = overviewBuffers[0][i] / (float)INT16_MAX; buffer[(i * 2) + 1] = overviewBuffers[2][i] / (float)INT16_MAX; } - // fprintf(stderr, "%u\n", nValues); fwrite(buffer, sizeof(float), nValues * 2, stdout); } else { for (uint32_t i = 0; i < nValues; i++) { @@ -40,6 +139,7 @@ void get_overview_buffers(int16_t** overviewBuffers, int16_t overflow, uint32_t fwrite(buffer, sizeof(float), nValues, stdout); } +#endif } void cleanup(int16_t unit) { @@ -93,7 +193,8 @@ int main(int argc, char** argv) { buffer = (float*)malloc(sizeof(float) * (cfg->max_buffersize) * (cfg->iq_mode ? 2 : 1)); // start data capture without aggregation - fprintf(stderr, "sample_interval=%u, time_units=%u, buffer_size=%u, max_buffer_size=%u\n", cfg->sample_interval, cfg->time_units, cfg->buffersize, cfg->max_buffersize); + fprintf(stderr, "sample_interval=%u, time_units=%u, buffer_size=%u, max_buffer_size=%u\n", cfg->sample_interval, cfg->time_units, cfg->buffersize, + cfg->max_buffersize); if (ps2000_run_streaming_ns(unit, cfg->sample_interval, cfg->time_units, cfg->buffersize, false, 1, cfg->max_buffersize) == 0) { fprintf(stderr, "run_streaming_ns failed!\n"); cleanup(unit);