Revision | d0208b853f081da3bddca1d6bf83eb63559a50a4 (tree) |
---|---|
Time | 2020-09-27 19:52:37 |
Author | Starg <starg@user...> |
Committer | Starg |
Merge branch 'dev41' into unicode
@@ -320,9 +320,34 @@ static FLAC__StreamDecoderWriteStatus flac_write_callback(const FLAC__StreamDeco | ||
320 | 320 | int32 *dest = (int32 *)sdr->data[i] + context->current_size_in_samples; |
321 | 321 | int n = frame->header.blocksize; |
322 | 322 | |
323 | - // This loop should be auto-vectorized. | |
324 | - for (int j = 0; j < n; j++) { | |
323 | + int j = 0; | |
324 | + | |
325 | +#if USE_X86_EXT_INTRIN >= 9 | |
326 | + int n8 = n & ~7; | |
327 | + __m128i vs = _mm_cvtsi32_si128(s); | |
328 | + | |
329 | + while (j < n8) { | |
330 | + __m256i v = _mm256_loadu_si256((const __m256i *)(src + j)); | |
331 | + v = _mm256_sll_epi32(v, vs); | |
332 | + _mm256_storeu_si256((__m256i *)(dest + j), v); | |
333 | + j += 8; | |
334 | + } | |
335 | + | |
336 | +#elif USE_X86_EXT_INTRIN >= 3 | |
337 | + int n4 = n & ~3; | |
338 | + __m128i vs = _mm_cvtsi32_si128(s); | |
339 | + | |
340 | + while (j < n4) { | |
341 | + __m128i v = _mm_loadu_si128((const __m128i *)(src + j)); | |
342 | + v = _mm_sll_epi32(v, vs); | |
343 | + _mm_storeu_si128((__m128i *)(dest + j), v); | |
344 | + j += 4; | |
345 | + } | |
346 | +#endif | |
347 | + | |
348 | + while (j < n) { | |
325 | 349 | dest[j] = src[j] << s; |
350 | + j++; | |
326 | 351 | } |
327 | 352 | } |
328 | 353 | break; |
@@ -336,7 +361,25 @@ static FLAC__StreamDecoderWriteStatus flac_write_callback(const FLAC__StreamDeco | ||
336 | 361 | |
337 | 362 | int j = 0; |
338 | 363 | |
339 | -#if USE_X86_EXT_INTRIN >= 3 | |
364 | +#if USE_X86_EXT_INTRIN >= 9 | |
365 | + int n16 = n & ~15; | |
366 | + __m128i vs = _mm_cvtsi32_si128(s); | |
367 | + | |
368 | + while (j < n16) { | |
369 | + __m256i v01 = _mm256_loadu_si256((const __m256i *)(src + j)); | |
370 | + __m256i v23 = _mm256_loadu_si256((const __m256i *)(src + j + 8)); | |
371 | + | |
372 | + __m256i v02 = _mm256_permute2x128_si256(v01, v23, (2 << 4) | 0); | |
373 | + __m256i v13 = _mm256_permute2x128_si256(v01, v23, (3 << 4) | 1); | |
374 | + | |
375 | + __m256i v = _mm256_packs_epi32(v02, v13); | |
376 | + v = _mm256_sll_epi16(v, vs); | |
377 | + _mm256_storeu_si256((__m256i *)(dest + j), v); | |
378 | + | |
379 | + j += 16; | |
380 | + } | |
381 | + | |
382 | +#elif USE_X86_EXT_INTRIN >= 3 | |
340 | 383 | int n8 = n & ~7; |
341 | 384 | __m128i vs = _mm_cvtsi32_si128(s); |
342 | 385 |