From 18 Jan, 2021 0:00 UTC: All services will be temporarily unavailable for maintenance
  • R/O
  • HTTP
  • SSH
  • HTTPS

timidity41: Commit


Commit MetaInfo

Revision: 3617463359a06b97afc4f9877f6f32f9b31dadff (tree)
Time: 2020-09-27 19:51:34
Author: Starg <starg@user...>
Committer: Starg

Log Message

[decode] Improve FLAC decoder

Change Summary

Incremental Difference

--- a/timidity/decode.c
+++ b/timidity/decode.c
@@ -320,9 +320,34 @@ static FLAC__StreamDecoderWriteStatus flac_write_callback(const FLAC__StreamDeco
320320 int32 *dest = (int32 *)sdr->data[i] + context->current_size_in_samples;
321321 int n = frame->header.blocksize;
322322
323- // This loop should be auto-vectorized.
324- for (int j = 0; j < n; j++) {
323+ int j = 0;
324+
325+#if USE_X86_EXT_INTRIN >= 9
326+ int n8 = n & ~7;
327+ __m128i vs = _mm_cvtsi32_si128(s);
328+
329+ while (j < n8) {
330+ __m256i v = _mm256_loadu_si256((const __m256i *)(src + j));
331+ v = _mm256_sll_epi32(v, vs);
332+ _mm256_storeu_si256((__m256i *)(dest + j), v);
333+ j += 8;
334+ }
335+
336+#elif USE_X86_EXT_INTRIN >= 3
337+ int n4 = n & ~3;
338+ __m128i vs = _mm_cvtsi32_si128(s);
339+
340+ while (j < n4) {
341+ __m128i v = _mm_loadu_si128((const __m128i *)(src + j));
342+ v = _mm_sll_epi32(v, vs);
343+ _mm_storeu_si128((__m128i *)(dest + j), v);
344+ j += 4;
345+ }
346+#endif
347+
348+ while (j < n) {
325349 dest[j] = src[j] << s;
350+ j++;
326351 }
327352 }
328353 break;
@@ -336,7 +361,25 @@ static FLAC__StreamDecoderWriteStatus flac_write_callback(const FLAC__StreamDeco
336361
337362 int j = 0;
338363
339-#if USE_X86_EXT_INTRIN >= 3
364+#if USE_X86_EXT_INTRIN >= 9
365+ int n16 = n & ~15;
366+ __m128i vs = _mm_cvtsi32_si128(s);
367+
368+ while (j < n16) {
369+ __m256i v01 = _mm256_loadu_si256((const __m256i *)(src + j));
370+ __m256i v23 = _mm256_loadu_si256((const __m256i *)(src + j + 8));
371+
372+ __m256i v02 = _mm256_permute2x128_si256(v01, v23, (2 << 4) | 0);
373+ __m256i v13 = _mm256_permute2x128_si256(v01, v23, (3 << 4) | 1);
374+
375+ __m256i v = _mm256_packs_epi32(v02, v13);
376+ v = _mm256_sll_epi16(v, vs);
377+ _mm256_storeu_si256((__m256i *)(dest + j), v);
378+
379+ j += 16;
380+ }
381+
382+#elif USE_X86_EXT_INTRIN >= 3
340383 int n8 = n & ~7;
341384 __m128i vs = _mm_cvtsi32_si128(s);
342385
Show on old repository browser