
timidity41: Commit


Commit MetaInfo

Revision: 784c7e958188181428b4c1b3b682ff9255695758 (tree)
Time: 2020-04-20 22:57:02
Author: Starg <starg@user...>
Committer: Starg

Log Message

Merge branch 'dev41' into unicode

Change Summary

    timidity/effect.c
    timidity/effect.h
    timidity/thread_effect.c
    timidity/voice_effect.c
    timidity/wasapi_a.c

Incremental Difference

--- a/timidity/effect.c
+++ b/timidity/effect.c
@@ -2506,23 +2506,9 @@ static void do_drive_mono(Drive *drv, DATA_T *inout)
25062506 int32 index;
25072507 FLOAT_T in, sign, v1, v2, fp;
25082508
2509- static FLOAT_T max = 0, avg = 0, sum = 0;
2510- static int32 tc = 0;
2511-
2512-
2513-
2514-
25152509 in = *inout;
25162510 sign = (in < 0) ? (-1.0) : (1.0);
25172511 in *= drv->cnv * sign;
2518-
2519- if(in > 1 && in > max)
2520- max = in;
2521- ++tc;
2522- sum += in;
2523- avg = sum / (FLOAT_T)tc;
2524-
2525-
25262512 fp = floor(in);
25272513 index = fp;
25282514 fp = in - fp;
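
The effect.c hunk strips leftover debug instrumentation from do_drive_mono(): four static accumulators tracking the peak and running average of the post-gain level. Besides being debug-only, the statics were shared across every call and never reset, so dropping them also removes a thread-safety hazard. What survives is the table-lookup preamble; in scalar form (reconstructed from the context lines, so treat it as a sketch):

    /* Surviving preamble of do_drive_mono(), per the context lines: */
    in   = *inout;                        /* current sample              */
    sign = (in < 0) ? (-1.0) : (1.0);     /* remember the sign...        */
    in  *= drv->cnv * sign;               /* ...and fold to >= 0, scaled */
    fp    = floor(in);
    index = fp;                           /* integer part: table index   */
    fp    = in - fp;                      /* fraction: for interpolation  */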
--- a/timidity/effect.h
+++ b/timidity/effect.h
@@ -505,9 +505,9 @@ int32
505505 in: 0.0 ~ 8.0 (1.0: 1<<(DRIVE_INPUT_BIT) , DRIVE_SCALE_BIT+DRIVE_BASE_BIT+FRACTION_BITS < 30bit
506506 out: 0.0 ~ 8.0 * clip_level
507507 */
508-#define DRIVE_SCALE_BIT (3) // 1.0 * 2^MATH_SCALE_BIT
508+#define DRIVE_SCALE_BIT (2) // 1.0 * 2^MATH_SCALE_BIT
509509 #define DRIVE_SCALE_MAX (1 << DRIVE_SCALE_BIT) // table max 1.0 * MATH_SCALE_MAX
510-#define DRIVE_BASE_BIT (6) // 0.0~1.0 table size
510+#define DRIVE_BASE_BIT (8) // 0.0~1.0 table size
511511 #define DRIVE_BASE_LENGTH (1 << (DRIVE_BASE_BIT)) // 0.0~1.0:table size
512512 #define DRIVE_TABLE_LENGTH (1 << (DRIVE_BASE_BIT + DRIVE_SCALE_BIT)) // 0.0~1.0 * MATH_SCALE_MAX table size
513513 #define DRIVE_FRAC_BIT (14) // for int32
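
The header change trades input-scale headroom for table resolution: DRIVE_SCALE_MAX drops from 8 to 4 while the 0.0~1.0 segment of the drive table gets four times as many entries, doubling the total table length. Working the macros through by hand (a quick check, not repository code):

    /* Before (DRIVE_SCALE_BIT=3, DRIVE_BASE_BIT=6):             */
    /*   DRIVE_SCALE_MAX    = 1 << 3       =    8                */
    /*   DRIVE_BASE_LENGTH  = 1 << 6       =   64  (0.0~1.0)     */
    /*   DRIVE_TABLE_LENGTH = 1 << (6 + 3) =  512  (total)       */
    /* After  (DRIVE_SCALE_BIT=2, DRIVE_BASE_BIT=8):             */
    /*   DRIVE_SCALE_MAX    = 1 << 2       =    4                */
    /*   DRIVE_BASE_LENGTH  = 1 << 8       =  256  (0.0~1.0)     */
    /*   DRIVE_TABLE_LENGTH = 1 << (8 + 2) = 1024  (total)       */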
--- a/timidity/thread_effect.c
+++ b/timidity/thread_effect.c
@@ -1970,6 +1970,8 @@ void init_effect_buffer_thread(void)
19701970 memset(delay_effect_buffer_sub, 0, sizeof(delay_effect_buffer_sub));
19711971 memset(reverb_effect_buffer_sub, 0, sizeof(reverb_effect_buffer_sub));
19721972
1973+ memset(master_effect_buffer_thread, 0, sizeof(master_effect_buffer_thread));
1974+
19731975 reset_effect_thread_var();
19741976 }
19751977
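
The one-line fix clears master_effect_buffer_thread together with the other per-thread effect buffers, so the first block rendered after (re)initialization cannot mix in stale samples. The memset(buf, 0, sizeof(buf)) idiom used throughout this function is safe only because these are true arrays in scope; as a general C caveat (illustrative names, not from this file):

    DATA_T buf[4096];              /* true array: sizeof(buf) is the whole buffer */
    memset(buf, 0, sizeof(buf));   /* clears all 4096 elements                    */

    DATA_T *p = buf;
    memset(p, 0, sizeof(p));       /* WRONG: clears only pointer-size bytes       */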
--- a/timidity/voice_effect.c
+++ b/timidity/voice_effect.c
@@ -594,66 +594,7 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
594594 }
595595 break;
596596 case 1:
597-#if 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
598- {
599- const int32 req_count_mask = ~(0x7);
600- int32 count2 = count & req_count_mask;
601- __m256 vgain = _mm256_set1_ps((float)info->velgain);
602- __m256 vlevel = _mm256_set1_ps((float)info->level);
603- const __m256 vvp1 = _mm256_set1_ps(1.0);
604- const __m256 vvn1 = _mm256_set1_ps(-1.0);
605- const __m256 vvq = _mm256_set1_ps(0.25);
606- for (i = 0; i < count2; i += 8) {
607- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
608- vtmp1 = MM256_SET2X_PS(
609- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i])),
610- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4])) );
611- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
612- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
613- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
614- vsign1 = _mm256_or_ps(vsp, vsn);
615- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
616- vbase1 = _mm256_and_ps(vtmp1, vme);
617- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
618- vtmp1 = _mm256_sub_ps(vtmp1, vvp1);
619- vtmp1 = _mm256_mul_ps(vtmp1, vvq);
620- vtmp1 = _mm256_add_ps(vtmp1, vvp1);
621- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
622- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
623- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
624- _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x0)));
625- _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x1)));
626- }
627- }
628-#elif 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_FLOAT)
629- {
630- const int32 req_count_mask = ~(0x7);
631- int32 count2 = count & req_count_mask;
632- __m256 vgain = _mm256_set1_ps((float)info->velgain);
633- __m256 vlevel = _mm256_set1_ps((float)info->level);
634- const __m256 vvp1 = _mm256_set1_ps(1.0);
635- const __m256 vvn1 = _mm256_set1_ps(-1.0);
636- const __m256 vvq = _mm256_set1_ps(0.25);
637- for (i = 0; i < count2; i += 8) {
638- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
639- vtmp1 = _mm256_loadu_ps(&sp[i]);
640- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
641- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
642- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
643- vsign1 = _mm256_or_ps(vsp, vsn);
644- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
645- vbase1 = _mm256_and_ps(vtmp1, vme);
646- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
647- vtmp1 = _mm256_sub_ps(vtmp1, vvp1);
648- vtmp1 = _mm256_mul_ps(vtmp1, vvq);
649- vtmp1 = _mm256_add_ps(vtmp1, vvp1);
650- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
651- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
652- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
653- _mm256_storeu_ps(&sp[i], vtmp1);
654- }
655- }
656-#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
597+#if (USE_X86_EXT_INTRIN >= 2) && (defined(DATA_T_DOUBLE) || defined(DATA_T_FLOAT))
657598 {
658599 const int32 req_count_mask = ~(0x7);
659600 int32 count2 = count & req_count_mask;
@@ -663,46 +604,85 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
663604 const __m128 vvn1 = _mm_set1_ps(-1.0);
664605 const __m128 vvq = _mm_set1_ps(0.25);
665606 for (i = 0; i < count2; i += 8) {
607+/*
608+tmp = sp[i] * info->velgain;
609+sp = tmp >= 0 ? (1.0) : (0.0);
610+sn = tmp < 0 ? (-1.0) : (0.0);
611+sign = sp | sn;
612+base = tmp = tmp * sign;
613+tmp = 1.0 + (tmp - 1.0) * 0.25;
614+sp = base > 1.0 ? tmp : 0;
615+sn = base <= 1.0 ? base : 0;
616+tmp = sp | sn;
617+sp[i] = tmp * sign * info->level;
618+*/
666619 __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
620+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
621+ vtmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i]));
622+ vtmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4]));
623+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
667624 vtmp1 = _mm_shuffle_ps(
668625 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i])),
669626 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 2])), 0x44);
670627 vtmp2 = _mm_shuffle_ps(
671628 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 4])),
672629 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 6])), 0x44);
630+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
631+ vtmp1 = _mm_loadu_ps(&sp[i]);
632+ vtmp2 = _mm_loadu_ps(&sp[i + 4]);
633+#endif
673634 vtmp1 = _mm_mul_ps(vtmp1, vgain);
674- vtmp2 = _mm_mul_ps(vtmp2, vgain);
675- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
676- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
677- vsign1 = _mm_or_ps(vsp, vsn);
678- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
679- vbase1 = _mm_and_ps(vtmp1, vme);
680- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
681- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
682- vsign2 = _mm_or_ps(vsp, vsn);
683- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
684- vbase2 = _mm_and_ps(vtmp2, vme);
685- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
686- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
635+ vtmp2 = _mm_mul_ps(vtmp2, vgain);
636+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp1, _mm_setzero_ps()));
637+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, _mm_setzero_ps()));
638+ vsign1 = _mm_or_ps(vsp, vsn);
639+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp2, _mm_setzero_ps()));
640+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, _mm_setzero_ps()));
641+ vsign2 = _mm_or_ps(vsp, vsn);
642+ vbase1 = vtmp1 = _mm_mul_ps(vtmp1, vsign1);
643+ vbase2 = vtmp2 = _mm_mul_ps(vtmp2, vsign2);
687644 vtmp1 = _mm_sub_ps(vtmp1, vvp1);
688645 vtmp2 = _mm_sub_ps(vtmp2, vvp1);
689646 vtmp1 = _mm_mul_ps(vtmp1, vvq);
690647 vtmp2 = _mm_mul_ps(vtmp2, vvq);
691648 vtmp1 = _mm_add_ps(vtmp1, vvp1);
692- vtmp2 = _mm_add_ps(vtmp2, vvp1);
649+ vtmp2 = _mm_add_ps(vtmp2, vvp1);
650+ vsp = _mm_and_ps(vtmp1, _mm_cmpgt_ps(vbase1, vvp1));
651+ vsn = _mm_and_ps(vbase1, _mm_cmple_ps(vbase1, vvp1));
652+ vtmp1 = _mm_or_ps(vsp, vsn);
653+ vsp = _mm_and_ps(vtmp2, _mm_cmpgt_ps(vbase2, vvp1));
654+ vsn = _mm_and_ps(vbase2, _mm_cmple_ps(vbase2, vvp1));
655+ vtmp2 = _mm_or_ps(vsp, vsn);
693656 vtmp1 = _mm_mul_ps(vtmp1, vsign1);
694657 vtmp2 = _mm_mul_ps(vtmp2, vsign2);
695- vtmp1 = _mm_add_ps(vtmp1, vbase1);
696- vtmp2 = _mm_add_ps(vtmp2, vbase2);
697658 vtmp1 = _mm_mul_ps(vtmp1, vlevel);
698659 vtmp2 = _mm_mul_ps(vtmp2, vlevel);
660+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
661+ _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(vtmp1));
662+ _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(vtmp2));
663+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
699664 _mm_storeu_pd(&sp[i], _mm_cvtps_pd(vtmp1));
700665 _mm_storeu_pd(&sp[i + 2], _mm_cvtps_pd(_mm_movehl_ps(vtmp1,vtmp1)));
701666 _mm_storeu_pd(&sp[i + 4], _mm_cvtps_pd(vtmp2));
702667 _mm_storeu_pd(&sp[i + 6], _mm_cvtps_pd(_mm_movehl_ps(vtmp2,vtmp2)));
668+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
669+ _mm_storeu_ps(&sp[i], vtmp1);
670+ _mm_storeu_ps(&sp[i + 4], vtmp2);
671+#endif
703672 }
704673 }
705-#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
674+#endif // USE_X86_EXT_INTRIN
675+ for (; i < count; i++) {
676+ FLOAT_T tmp = sp[i] * info->velgain;
677+ if(tmp > 1.0)
678+ tmp = 1.0 + (tmp - 1.0) * 0.25;
679+ else if(tmp < -1.0)
680+ tmp = -1.0 - (tmp + 1.0) * 0.25;
681+ sp[i] = tmp * info->level;
682+ }
683+ break;
684+ case 2:
685+#if (USE_X86_EXT_INTRIN >= 2) && (defined(DATA_T_DOUBLE) || defined(DATA_T_FLOAT))
706686 {
707687 const int32 req_count_mask = ~(0x7);
708688 int32 count2 = count & req_count_mask;
@@ -712,37 +692,71 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
712692 const __m128 vvn1 = _mm_set1_ps(-1.0);
713693 const __m128 vvq = _mm_set1_ps(0.25);
714694 for (i = 0; i < count2; i += 8) {
695+/*
696+tmp = sp[i] * info->velgain;
697+sp = tmp >= 0 ? (1.0) : (0.0);
698+sn = tmp < 0 ? (-1.0) : (0.0);
699+sign = sp | sn;
700+base = tmp = tmp * sign;
701+tmp = 1.0 - (tmp - 1.0) * 0.25;
702+sp = base > 1.0 ? tmp : 0;
703+sn = base <= 1.0 ? base : 0;
704+tmp = sp | sn;
705+sp[i] = tmp * sign * info->level;
706+*/
715707 __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
708+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
709+ vtmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i]));
710+ vtmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4]));
711+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
712+ vtmp1 = _mm_shuffle_ps(
713+ _mm_cvtpd_ps(_mm_loadu_pd(&sp[i])),
714+ _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 2])), 0x44);
715+ vtmp2 = _mm_shuffle_ps(
716+ _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 4])),
717+ _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 6])), 0x44);
718+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
716719 vtmp1 = _mm_loadu_ps(&sp[i]);
717720 vtmp2 = _mm_loadu_ps(&sp[i + 4]);
721+#endif
718722 vtmp1 = _mm_mul_ps(vtmp1, vgain);
719- vtmp2 = _mm_mul_ps(vtmp2, vgain);
720- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
721- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
722- vsign1 = _mm_or_ps(vsp, vsn);
723- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
724- vbase1 = _mm_and_ps(vtmp1, vme);
725- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
726- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
727- vsign2 = _mm_or_ps(vsp, vsn);
728- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
729- vbase2 = _mm_and_ps(vtmp2, vme);
730- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
731- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
723+ vtmp2 = _mm_mul_ps(vtmp2, vgain);
724+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp1, _mm_setzero_ps()));
725+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, _mm_setzero_ps()));
726+ vsign1 = _mm_or_ps(vsp, vsn);
727+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp2, _mm_setzero_ps()));
728+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, _mm_setzero_ps()));
729+ vsign2 = _mm_or_ps(vsp, vsn);
730+ vbase1 = vtmp1 = _mm_mul_ps(vtmp1, vsign1);
731+ vbase2 = vtmp2 = _mm_mul_ps(vtmp2, vsign2);
732732 vtmp1 = _mm_sub_ps(vtmp1, vvp1);
733733 vtmp2 = _mm_sub_ps(vtmp2, vvp1);
734734 vtmp1 = _mm_mul_ps(vtmp1, vvq);
735735 vtmp2 = _mm_mul_ps(vtmp2, vvq);
736- vtmp1 = _mm_add_ps(vtmp1, vvp1);
737- vtmp2 = _mm_add_ps(vtmp2, vvp1);
736+ vtmp1 = _mm_sub_ps(vvp1, vtmp1);
737+ vtmp2 = _mm_sub_ps(vvp1, vtmp2);
738+ vsp = _mm_and_ps(vtmp1, _mm_cmpgt_ps(vbase1, vvp1));
739+ vsn = _mm_and_ps(vbase1, _mm_cmple_ps(vbase1, vvp1));
740+ vtmp1 = _mm_or_ps(vsp, vsn);
741+ vsp = _mm_and_ps(vtmp2, _mm_cmpgt_ps(vbase2, vvp1));
742+ vsn = _mm_and_ps(vbase2, _mm_cmple_ps(vbase2, vvp1));
743+ vtmp2 = _mm_or_ps(vsp, vsn);
738744 vtmp1 = _mm_mul_ps(vtmp1, vsign1);
739745 vtmp2 = _mm_mul_ps(vtmp2, vsign2);
740- vtmp1 = _mm_add_ps(vtmp1, vbase1);
741- vtmp2 = _mm_add_ps(vtmp2, vbase2);
742746 vtmp1 = _mm_mul_ps(vtmp1, vlevel);
743747 vtmp2 = _mm_mul_ps(vtmp2, vlevel);
748+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
749+ _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(vtmp1));
750+ _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(vtmp2));
751+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
752+ _mm_storeu_pd(&sp[i], _mm_cvtps_pd(vtmp1));
753+ _mm_storeu_pd(&sp[i + 2], _mm_cvtps_pd(_mm_movehl_ps(vtmp1,vtmp1)));
754+ _mm_storeu_pd(&sp[i + 4], _mm_cvtps_pd(vtmp2));
755+ _mm_storeu_pd(&sp[i + 6], _mm_cvtps_pd(_mm_movehl_ps(vtmp2,vtmp2)));
756+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
744757 _mm_storeu_ps(&sp[i], vtmp1);
745758 _mm_storeu_ps(&sp[i + 4], vtmp2);
759+#endif
746760 }
747761 }
748762 #endif // USE_X86_EXT_INTRIN
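
Both rewritten branches lean on the same branch-free select idiom that the new pseudocode comments describe: a compare produces an all-ones or all-zero mask per lane, AND keeps the wanted lanes, and OR merges the two halves. In isolation (vvp1 and vvn1 are the +1.0/-1.0 constants from the diff; vx, vbase, and vshaped are hypothetical stand-ins):

    /* sign = (x >= 0) ? +1.0 : -1.0, branch-free, as the diff builds it: */
    __m128 vsign = _mm_or_ps(
        _mm_and_ps(vvp1, _mm_cmpge_ps(vx, _mm_setzero_ps())),
        _mm_and_ps(vvn1, _mm_cmplt_ps(vx, _mm_setzero_ps())));

    /* result = (base > 1.0) ? shaped : base, same trick with
     * complementary compares: */
    __m128 hi  = _mm_and_ps(vshaped, _mm_cmpgt_ps(vbase, vvp1)); /* lanes > 1.0  */
    __m128 lo  = _mm_and_ps(vbase,   _mm_cmple_ps(vbase, vvp1)); /* lanes <= 1.0 */
    __m128 sel = _mm_or_ps(hi, lo);

The old code instead applied the shaping curve unconditionally and added the masked base back in, which is why the cmpge-against-zero sign extraction and the post-shaping select replace the old cmpgt/cmplt-against-±1.0 masks.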
@@ -755,61 +769,8 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
755769 sp[i] = tmp * info->level;
756770 }
757771 break;
758- case 2:
759-#if 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
760- {
761- const int32 req_count_mask = ~(0x7);
762- int32 count2 = count & req_count_mask;
763- __m256 vgain = _mm256_set1_ps((float)info->velgain);
764- __m256 vlevel = _mm256_set1_ps((float)info->level);
765- const __m256 vvp1 = _mm256_set1_ps(1.0);
766- const __m256 vvn1 = _mm256_set1_ps(-1.0);
767- for (i = 0; i < count2; i += 8) {
768- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
769- vtmp1 = MM256_SET2X_PS(
770- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i])),
771- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4])) );
772- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
773- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
774- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
775- vsign1 = _mm256_or_ps(vsp, vsn);
776- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
777- vbase1 = _mm256_and_ps(vtmp1, vme);
778- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
779- vtmp1 = _mm256_sqrt_ps(vtmp1);
780- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
781- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
782- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
783- _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x0)));
784- _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x1)));
785- }
786- }
787-#elif 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_FLOAT)
788- {
789- const int32 req_count_mask = ~(0x7);
790- int32 count2 = count & req_count_mask;
791- __m256 vgain = _mm256_set1_ps((float)info->velgain);
792- __m256 vlevel = _mm256_set1_ps((float)info->level);
793- const __m256 vvp1 = _mm256_set1_ps(1.0);
794- const __m256 vvn1 = _mm256_set1_ps(-1.0);
795- for (i = 0; i < count2; i += 8) {
796- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
797- vtmp1 = _mm256_loadu_ps(&sp[i]);
798- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
799- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
800- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
801- vsign1 = _mm256_or_ps(vsp, vsn);
802- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
803- vbase1 = _mm256_and_ps(vtmp1, vme);
804- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
805- vtmp1 = _mm256_sqrt_ps(vtmp1);
806- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
807- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
808- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
809- _mm256_storeu_ps(&sp[i], vtmp1);
810- }
811- }
812-#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
772+ case 3:
773+#if (USE_X86_EXT_INTRIN >= 2) && (defined(DATA_T_DOUBLE) || defined(DATA_T_FLOAT))
813774 {
814775 const int32 req_count_mask = ~(0x7);
815776 int32 count2 = count & req_count_mask;
@@ -818,78 +779,67 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
818779 const __m128 vvp1 = _mm_set1_ps(1.0);
819780 const __m128 vvn1 = _mm_set1_ps(-1.0);
820781 for (i = 0; i < count2; i += 8) {
782+/*
783+tmp = sp[i] * info->velgain;
784+sp = tmp >= 0 ? (1.0) : 0;
785+sn = tmp < 0 ? (-1.0) : 0;
786+sign = sp | sn;
787+base = tmp = tmp * sign;
788+tmp = sqrt(tmp);
789+sp = base > 1.0 ? tmp : 0;
790+sn = base <= 1.0 ? base : 0;
791+tmp = sp | sn;
792+sp[i] = tmp * sign * info->level;
793+*/
821794 __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
822- __m128d vtmpd1, vtmpd2;
795+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
796+ vtmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i]));
797+ vtmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4]));
798+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
823799 vtmp1 = _mm_shuffle_ps(
824800 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i])),
825801 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 2])), 0x44);
826802 vtmp2 = _mm_shuffle_ps(
827803 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 4])),
828804 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 6])), 0x44);
805+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
806+ vtmp1 = _mm_loadu_ps(&sp[i]);
807+ vtmp2 = _mm_loadu_ps(&sp[i + 4]);
808+#endif
829809 vtmp1 = _mm_mul_ps(vtmp1, vgain);
830- vtmp2 = _mm_mul_ps(vtmp2, vgain);
831- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
832- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
833- vsign1 = _mm_or_ps(vsp, vsn);
834- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
835- vbase1 = _mm_and_ps(vtmp1, vme);
836- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
837- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
838- vsign2 = _mm_or_ps(vsp, vsn);
839- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
840- vbase2 = _mm_and_ps(vtmp2, vme);
841- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
842- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
810+ vtmp2 = _mm_mul_ps(vtmp2, vgain);
811+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp1, _mm_setzero_ps()));
812+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, _mm_setzero_ps()));
813+ vsign1 = _mm_or_ps(vsp, vsn);
814+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp2, _mm_setzero_ps()));
815+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, _mm_setzero_ps()));
816+ vsign2 = _mm_or_ps(vsp, vsn);
817+ vbase1 = vtmp1 = _mm_mul_ps(vtmp1, vsign1);
818+ vbase2 = vtmp2 = _mm_mul_ps(vtmp2, vsign2);
843819 vtmp1 = _mm_sqrt_ps(vtmp1);
844820 vtmp2 = _mm_sqrt_ps(vtmp2);
821+ vsp = _mm_and_ps(vtmp1, _mm_cmpgt_ps(vbase1, vvp1));
822+ vsn = _mm_and_ps(vbase1, _mm_cmple_ps(vbase1, vvp1));
823+ vtmp1 = _mm_or_ps(vsp, vsn);
824+ vsp = _mm_and_ps(vtmp2, _mm_cmpgt_ps(vbase2, vvp1));
825+ vsn = _mm_and_ps(vbase2, _mm_cmple_ps(vbase2, vvp1));
826+ vtmp2 = _mm_or_ps(vsp, vsn);
845827 vtmp1 = _mm_mul_ps(vtmp1, vsign1);
846828 vtmp2 = _mm_mul_ps(vtmp2, vsign2);
847- vtmp1 = _mm_add_ps(vtmp1, vbase1);
848- vtmp2 = _mm_add_ps(vtmp2, vbase2);
849829 vtmp1 = _mm_mul_ps(vtmp1, vlevel);
850830 vtmp2 = _mm_mul_ps(vtmp2, vlevel);
831+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
832+ _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(vtmp1));
833+ _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(vtmp2));
834+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
851835 _mm_storeu_pd(&sp[i], _mm_cvtps_pd(vtmp1));
852836 _mm_storeu_pd(&sp[i + 2], _mm_cvtps_pd(_mm_movehl_ps(vtmp1,vtmp1)));
853837 _mm_storeu_pd(&sp[i + 4], _mm_cvtps_pd(vtmp2));
854838 _mm_storeu_pd(&sp[i + 6], _mm_cvtps_pd(_mm_movehl_ps(vtmp2,vtmp2)));
855- }
856- }
857839 #elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
858- {
859- const int32 req_count_mask = ~(0x7);
860- int32 count2 = count & req_count_mask;
861- __m128 vgain = _mm_set1_ps((float)info->velgain);
862- __m128 vlevel = _mm_set1_ps((float)info->level);
863- const __m128 vvp1 = _mm_set1_ps(1.0);
864- const __m128 vvn1 = _mm_set1_ps(-1.0);
865- for (i = 0; i < count2; i += 8) {
866- __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
867- vtmp1 = _mm_loadu_ps(&sp[i]);
868- vtmp2 = _mm_loadu_ps(&sp[i + 4]);
869- vtmp1 = _mm_mul_ps(vtmp1, vgain);
870- vtmp2 = _mm_mul_ps(vtmp2, vgain);
871- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
872- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
873- vsign1 = _mm_or_ps(vsp, vsn);
874- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
875- vbase1 = _mm_and_ps(vtmp1, vme);
876- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
877- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
878- vsign2 = _mm_or_ps(vsp, vsn);
879- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
880- vbase2 = _mm_and_ps(vtmp2, vme);
881- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
882- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
883- vtmp1 = _mm_sqrt_ps(vtmp1);
884- vtmp2 = _mm_sqrt_ps(vtmp2);
885- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
886- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
887- vtmp1 = _mm_add_ps(vtmp1, vbase1);
888- vtmp2 = _mm_add_ps(vtmp2, vbase2);
889- vtmp1 = _mm_mul_ps(vtmp1, vlevel);
890- vtmp2 = _mm_mul_ps(vtmp2, vlevel);
891840 _mm_storeu_ps(&sp[i], vtmp1);
892841 _mm_storeu_ps(&sp[i + 4], vtmp2);
842+#endif
893843 }
894844 }
895845 #endif // USE_X86_EXT_INTRIN
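
Instead of duplicating the whole loop per instruction set, the rewrite keeps one __m128 body and #if-selects only the load/convert and store/convert steps: the AVX path converts four doubles at a time with _mm256_cvtpd_ps, while the SSE2 path stitches two _mm_cvtpd_ps halves together with _mm_shuffle_ps(..., 0x44). Following the pseudocode comment the commit itself adds, the per-sample math of case 3 is (a scalar sketch, not the file's own fallback):

    /* Case 3: sqrt soft clip above full scale. */
    FLOAT_T tmp  = sp[i] * info->velgain;
    FLOAT_T sign = (tmp >= 0) ? 1.0 : -1.0;
    FLOAT_T base = tmp * sign;               /* |tmp|                       */
    tmp = sqrt(base);
    tmp = (base > 1.0) ? tmp : base;         /* shape only above full scale */
    sp[i] = tmp * sign * info->level;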
@@ -903,64 +853,7 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
903853 }
904854 break;
905855 case 4:
906-#if 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
907- {
908- const int32 req_count_mask = ~(0x7);
909- int32 count2 = count & req_count_mask;
910- __m256 vgain = _mm256_set1_ps((float)info->velgain);
911- __m256 vlevel = _mm256_set1_ps((float)info->level);
912- const __m256 vvp1 = _mm256_set1_ps(1.0);
913- const __m256 vvn1 = _mm256_set1_ps(-1.0);
914- const __m256 vvp2 = _mm256_set1_ps(2.0);
915- for (i = 0; i < count2; i += 8) {
916- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
917- vtmp1 = MM256_SET2X_PS(
918- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i])),
919- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4])) );
920- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
921- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
922- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
923- vsign1 = _mm256_or_ps(vsp, vsn);
924- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
925- vbase1 = _mm256_and_ps(vtmp1, vme);
926- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
927- vtmp1 = _mm256_sqrt_ps(vtmp1);
928- vtmp1 = _mm256_sub_ps(vvp2, vtmp1);
929- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
930- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
931- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
932- _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x0)));
933- _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x1)));
934- }
935- }
936-#elif 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_FLOAT)
937- {
938- const int32 req_count_mask = ~(0x7);
939- int32 count2 = count & req_count_mask;
940- __m256 vgain = _mm256_set1_ps((float)info->velgain);
941- __m256 vlevel = _mm256_set1_ps((float)info->level);
942- const __m256 vvp1 = _mm256_set1_ps(1.0);
943- const __m256 vvn1 = _mm256_set1_ps(-1.0);
944- const __m256 vvp2 = _mm256_set1_ps(2.0);
945- for (i = 0; i < count2; i += 8) {
946- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
947- vtmp1 = _mm256_loadu_ps(&sp[i]);
948- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
949- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
950- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
951- vsign1 = _mm256_or_ps(vsp, vsn);
952- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
953- vbase1 = _mm256_and_ps(vtmp1, vme);
954- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
955- vtmp1 = _mm256_sqrt_ps(vtmp1);
956- vtmp1 = _mm256_sub_ps(vvp2, vtmp1);
957- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
958- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
959- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
960- _mm256_storeu_ps(&sp[i], vtmp1);
961- }
962- }
963-#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
856+#if (USE_X86_EXT_INTRIN >= 2) && (defined(DATA_T_DOUBLE) || defined(DATA_T_FLOAT))
964857 {
965858 const int32 req_count_mask = ~(0x7);
966859 int32 count2 = count & req_count_mask;
@@ -970,83 +863,69 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
970863 const __m128 vvn1 = _mm_set1_ps(-1.0);
971864 const __m128 vvp2 = _mm_set1_ps(2.0);
972865 for (i = 0; i < count2; i += 8) {
866+/*
867+tmp = sp[i] * info->velgain;
868+sp = tmp >= 0 ? (1.0) : (0.0);
869+sn = tmp < 0 ? (-1.0) : (0.0);
870+sign = sp | sn;
871+base = tmp = tmp * sign;
872+tmp = 2.0 - sqrt(tmp);
873+sp = base > 1.0 ? tmp : 0;
874+sn = base <= 1.0 ? base : 0;
875+tmp = sp | sn;
876+sp[i] = tmp * sign * info->level;
877+*/
973878 __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
974- __m128d vtmpd1, vtmpd2;
879+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
880+ vtmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i]));
881+ vtmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4]));
882+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
975883 vtmp1 = _mm_shuffle_ps(
976884 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i])),
977885 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 2])), 0x44);
978886 vtmp2 = _mm_shuffle_ps(
979887 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 4])),
980888 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 6])), 0x44);
889+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
890+ vtmp1 = _mm_loadu_ps(&sp[i]);
891+ vtmp2 = _mm_loadu_ps(&sp[i + 4]);
892+#endif
981893 vtmp1 = _mm_mul_ps(vtmp1, vgain);
982- vtmp2 = _mm_mul_ps(vtmp2, vgain);
983- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
984- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
985- vsign1 = _mm_or_ps(vsp, vsn);
986- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
987- vbase1 = _mm_and_ps(vtmp1, vme);
988- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
989- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
990- vsign2 = _mm_or_ps(vsp, vsn);
991- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
992- vbase2 = _mm_and_ps(vtmp2, vme);
993- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
994- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
894+ vtmp2 = _mm_mul_ps(vtmp2, vgain);
895+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp1, _mm_setzero_ps()));
896+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, _mm_setzero_ps()));
897+ vsign1 = _mm_or_ps(vsp, vsn);
898+ vsp = _mm_and_ps(vvp1, _mm_cmpge_ps(vtmp2, _mm_setzero_ps()));
899+ vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, _mm_setzero_ps()));
900+ vsign2 = _mm_or_ps(vsp, vsn);
901+ vbase1 = vtmp1 = _mm_mul_ps(vtmp1, vsign1);
902+ vbase2 = vtmp2 = _mm_mul_ps(vtmp2, vsign2);
995903 vtmp1 = _mm_sqrt_ps(vtmp1);
996904 vtmp2 = _mm_sqrt_ps(vtmp2);
997905 vtmp1 = _mm_sub_ps(vvp2, vtmp1);
998906 vtmp2 = _mm_sub_ps(vvp2, vtmp2);
907+ vsp = _mm_and_ps(vtmp1, _mm_cmpgt_ps(vbase1, vvp1));
908+ vsn = _mm_and_ps(vbase1, _mm_cmple_ps(vbase1, vvp1));
909+ vtmp1 = _mm_or_ps(vsp, vsn);
910+ vsp = _mm_and_ps(vtmp2, _mm_cmpgt_ps(vbase2, vvp1));
911+ vsn = _mm_and_ps(vbase2, _mm_cmple_ps(vbase2, vvp1));
912+ vtmp2 = _mm_or_ps(vsp, vsn);
999913 vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1000914 vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1001- vtmp1 = _mm_add_ps(vtmp1, vbase1);
1002- vtmp2 = _mm_add_ps(vtmp2, vbase2);
1003915 vtmp1 = _mm_mul_ps(vtmp1, vlevel);
1004916 vtmp2 = _mm_mul_ps(vtmp2, vlevel);
917+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
918+ _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(vtmp1));
919+ _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(vtmp2));
920+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
1005921 _mm_storeu_pd(&sp[i], _mm_cvtps_pd(vtmp1));
1006922 _mm_storeu_pd(&sp[i + 2], _mm_cvtps_pd(_mm_movehl_ps(vtmp1,vtmp1)));
1007923 _mm_storeu_pd(&sp[i + 4], _mm_cvtps_pd(vtmp2));
1008924 _mm_storeu_pd(&sp[i + 6], _mm_cvtps_pd(_mm_movehl_ps(vtmp2,vtmp2)));
1009- }
1010- }
1011925 #elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
1012- {
1013- const int32 req_count_mask = ~(0x7);
1014- int32 count2 = count & req_count_mask;
1015- __m128 vgain = _mm_set1_ps((float)info->velgain);
1016- __m128 vlevel = _mm_set1_ps((float)info->level);
1017- const __m128 vvp1 = _mm_set1_ps(1.0);
1018- const __m128 vvn1 = _mm_set1_ps(-1.0);
1019- const __m128 vvp2 = _mm_set1_ps(2.0);
1020- for (i = 0; i < count2; i += 8) {
1021- __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
1022- vtmp1 = _mm_loadu_ps(&sp[i]);
1023- vtmp2 = _mm_loadu_ps(&sp[i + 4]);
1024- vtmp1 = _mm_mul_ps(vtmp1, vgain);
1025- vtmp2 = _mm_mul_ps(vtmp2, vgain);
1026- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
1027- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
1028- vsign1 = _mm_or_ps(vsp, vsn);
1029- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
1030- vbase1 = _mm_and_ps(vtmp1, vme);
1031- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
1032- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
1033- vsign2 = _mm_or_ps(vsp, vsn);
1034- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
1035- vbase2 = _mm_and_ps(vtmp2, vme);
1036- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1037- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1038- vtmp1 = _mm_sqrt_ps(vtmp1);
1039- vtmp2 = _mm_sqrt_ps(vtmp2);
1040- vtmp1 = _mm_sub_ps(vvp2, vtmp1);
1041- vtmp2 = _mm_sub_ps(vvp2, vtmp2);
1042- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1043- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1044- vtmp1 = _mm_add_ps(vtmp1, vbase1);
1045- vtmp2 = _mm_add_ps(vtmp2, vbase2);
1046- vtmp1 = _mm_mul_ps(vtmp1, vlevel);
1047- vtmp2 = _mm_mul_ps(vtmp2, vlevel);
1048926 _mm_storeu_ps(&sp[i], vtmp1);
1049927 _mm_storeu_ps(&sp[i + 4], vtmp2);
928+#endif
1050929 }
1051930 }
1052931 #endif // USE_X86_EXT_INTRIN
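
Case 4 shares the same skeleton; only the curve changes to 2.0 - sqrt(|x|), so over-range samples fold back downward instead of saturating. Per the added pseudocode (again a scalar sketch):

    /* Case 4: 2.0 - sqrt fold-back above full scale. */
    FLOAT_T tmp  = sp[i] * info->velgain;
    FLOAT_T sign = (tmp >= 0) ? 1.0 : -1.0;
    FLOAT_T base = tmp * sign;               /* |tmp| */
    tmp = 2.0 - sqrt(base);
    tmp = (base > 1.0) ? tmp : base;
    sp[i] = tmp * sign * info->level;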
@@ -1068,163 +947,55 @@ static inline void do_vfx_distortion(int v, VoiceEffect *vfx, DATA_T *sp, int32
1068947 }
1069948 break;
1070949 case 6:
1071-#if 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
1072- {
1073- const int32 req_count_mask = ~(0x7);
1074- int32 count2 = count & req_count_mask;
1075- __m256 vgain = _mm256_set1_ps((float)info->velgain);
1076- __m256 vlevel = _mm256_set1_ps((float)info->level);
1077- const __m256 vvp1 = _mm256_set1_ps(1.0);
1078- const __m256 vvn1 = _mm256_set1_ps(-1.0);
1079- const __m256 vvq = _mm256_set1_ps(-0.15);
1080- const __m256 vv11 = _mm256_set1_ps(1.1);
1081- const __m256 vmsign = _mm256_set1_ps(-0.0f);
1082- for (i = 0; i < count2; i += 8) {
1083- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
1084- vtmp1 = MM256_SET2X_PS(
1085- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i])),
1086- _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4])) );
1087- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
1088- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
1089- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
1090- vsign1 = _mm256_or_ps(vsp, vsn);
1091- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
1092- vbase1 = _mm256_and_ps(vtmp1, vme);
1093- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
1094- vtmp1 = _mm256_andnot_ps(vtmp1, vmsign);
1095- vtmp1 = _mm256_mul_ps(vtmp1, vvq);
1096- vtmp1 = _mm256_add_ps(vtmp1, vv11);
1097- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
1098- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
1099- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
1100- _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x0)));
1101- _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(_mm256_extract_ps(vtmp1, 0x1)));
1102- }
1103- }
1104-#elif 0 && (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_FLOAT)
1105- {
1106- const int32 req_count_mask = ~(0x7);
1107- int32 count2 = count & req_count_mask;
1108- __m256 vgain = _mm256_set1_ps((float)info->velgain);
1109- __m256 vlevel = _mm256_set1_ps((float)info->level);
1110- const __m256 vvp1 = _mm256_set1_ps(1.0);
1111- const __m256 vvn1 = _mm256_set1_ps(-1.0);
1112- const __m256 vvq = _mm256_set1_ps(-0.15);
1113- const __m256 vv11 = _mm256_set1_ps(1.1);
1114- const __m256 vmsign = _mm256_set1_ps(-0.0f);
1115- for (i = 0; i < count2; i += 8) {
1116- __m256 vtmp1, vme, vsp, vsn, vsign1, vbase1;
1117- vtmp1 = _mm256_loadu_ps(&sp[i]);
1118- vtmp1 = _mm256_mul_ps(vtmp1, vgain);
1119- vsp = _mm256_and_ps(vvp1, _mm256_cmpgt_ps(vtmp1, vvp1));
1120- vsn = _mm256_and_ps(vvn1, _mm256_cmplt_ps(vtmp1, vvn1));
1121- vsign1 = _mm256_or_ps(vsp, vsn);
1122- vme = _mm256_and_ps(_mm256_cmple_ps(vtmp1, vvp1), _mm256_cmpge_ps(vtmp1, vvn1));
1123- vbase1 = _mm256_and_ps(vtmp1, vme);
1124- vtmp1 = _mm256_mul_ps(vtmp1, vsign1)
1125- vtmp1 = _mm256_andnot_ps(vtmp1, vmsign);
1126- vtmp1 = _mm256_mul_ps(vtmp1, vvq);
1127- vtmp1 = _mm256_add_ps(vtmp1, vv11);
1128- vtmp1 = _mm256_mul_ps(vtmp1, vsign1);
1129- vtmp1 = _mm256_add_ps(vtmp1, vbase1);
1130- vtmp1 = _mm256_mul_ps(vtmp1, vlevel);
1131- _mm256_storeu_ps(&sp[i], vtmp1);
1132- }
1133- }
1134-#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
950+#if (USE_X86_EXT_INTRIN >= 2) && (defined(DATA_T_DOUBLE) || defined(DATA_T_FLOAT))
1135951 {
1136952 const int32 req_count_mask = ~(0x7);
1137953 int32 count2 = count & req_count_mask;
1138954 __m128 vgain = _mm_set1_ps((float)info->velgain);
1139955 __m128 vlevel = _mm_set1_ps((float)info->level);
1140- const __m128 vvp1 = _mm_set1_ps(1.0);
1141- const __m128 vvn1 = _mm_set1_ps(-1.0);
1142956 const __m128 vvq = _mm_set1_ps(-0.15);
1143957 const __m128 vv11 = _mm_set1_ps(1.1);
1144958 const __m128 vmsign = _mm_set1_ps(-0.0f);
1145959 for (i = 0; i < count2; i += 8) {
1146- __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
960+ __m128 vtmp1, vtmp2, vbase1, vbase2;
961+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
962+ vtmp1 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i]));
963+ vtmp2 = _mm256_cvtpd_ps(_mm256_loadu_pd(&sp[i + 4]));
964+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
1147965 vtmp1 = _mm_shuffle_ps(
1148966 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i])),
1149967 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 2])), 0x44);
1150968 vtmp2 = _mm_shuffle_ps(
1151969 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 4])),
1152970 _mm_cvtpd_ps(_mm_loadu_pd(&sp[i + 6])), 0x44);
1153- vtmp1 = _mm_mul_ps(vtmp1, vgain);
1154- vtmp2 = _mm_mul_ps(vtmp2, vgain);
1155- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
1156- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
1157- vsign1 = _mm_or_ps(vsp, vsn);
1158- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
1159- vbase1 = _mm_and_ps(vtmp1, vme);
1160- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
1161- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
1162- vsign2 = _mm_or_ps(vsp, vsn);
1163- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
1164- vbase2 = _mm_and_ps(vtmp2, vme);
1165- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1166- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1167- vtmp1 = _mm_andnot_ps(vtmp1, vmsign);
1168- vtmp2 = _mm_andnot_ps(vtmp2, vmsign);
971+#elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
972+ vtmp1 = _mm_loadu_ps(&sp[i]);
973+ vtmp2 = _mm_loadu_ps(&sp[i + 4]);
974+#endif
975+ vbase1 = vtmp1 = _mm_mul_ps(vtmp1, vgain);
976+ vbase2 = vtmp2 = _mm_mul_ps(vtmp2, vgain);
977+ vtmp1 = _mm_andnot_ps(vtmp1, vmsign); // fabs
978+ vtmp2 = _mm_andnot_ps(vtmp2, vmsign); // fabs
1169979 vtmp1 = _mm_mul_ps(vtmp1, vvq);
1170980 vtmp2 = _mm_mul_ps(vtmp2, vvq);
1171981 vtmp1 = _mm_add_ps(vtmp1, vv11);
1172982 vtmp2 = _mm_add_ps(vtmp2, vv11);
1173- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1174- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1175983 vtmp1 = _mm_add_ps(vtmp1, vbase1);
1176984 vtmp2 = _mm_add_ps(vtmp2, vbase2);
1177985 vtmp1 = _mm_mul_ps(vtmp1, vlevel);
1178986 vtmp2 = _mm_mul_ps(vtmp2, vlevel);
987+#if (USE_X86_EXT_INTRIN >= 8) && defined(DATA_T_DOUBLE)
988+ _mm256_storeu_pd(&sp[i], _mm256_cvtps_pd(vtmp1));
989+ _mm256_storeu_pd(&sp[i + 4], _mm256_cvtps_pd(vtmp2));
990+#elif (USE_X86_EXT_INTRIN >= 3) && defined(DATA_T_DOUBLE)
1179991 _mm_storeu_pd(&sp[i], _mm_cvtps_pd(vtmp1));
1180992 _mm_storeu_pd(&sp[i + 2], _mm_cvtps_pd(_mm_movehl_ps(vtmp1,vtmp1)));
1181993 _mm_storeu_pd(&sp[i + 4], _mm_cvtps_pd(vtmp2));
1182994 _mm_storeu_pd(&sp[i + 6], _mm_cvtps_pd(_mm_movehl_ps(vtmp2,vtmp2)));
1183- }
1184- }
1185995 #elif (USE_X86_EXT_INTRIN >= 2) && defined(DATA_T_FLOAT)
1186- {
1187- const int32 req_count_mask = ~(0x7);
1188- int32 count2 = count & req_count_mask;
1189- __m128 vgain = _mm_set1_ps((float)info->velgain);
1190- __m128 vlevel = _mm_set1_ps((float)info->level);
1191- const __m128 vvp1 = _mm_set1_ps(1.0);
1192- const __m128 vvn1 = _mm_set1_ps(-1.0);
1193- const __m128 vvq = _mm_set1_ps(-0.15);
1194- const __m128 vv11 = _mm_set1_ps(1.1);
1195- const __m128 vmsign = _mm_set1_ps(-0.0f);
1196- for (i = 0; i < count2; i += 8) {
1197- __m128 vtmp1, vtmp2, vme, vsp, vsn, vsign1, vsign2, vbase1, vbase2;
1198- vtmp1 = _mm_loadu_ps(&sp[i]);
1199- vtmp2 = _mm_loadu_ps(&sp[i + 4]);
1200- vtmp1 = _mm_mul_ps(vtmp1, vgain);
1201- vtmp2 = _mm_mul_ps(vtmp2, vgain);
1202- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp1, vvp1));
1203- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp1, vvn1));
1204- vsign1 = _mm_or_ps(vsp, vsn);
1205- vme = _mm_and_ps(_mm_cmple_ps(vtmp1, vvp1), _mm_cmpge_ps(vtmp1, vvn1));
1206- vbase1 = _mm_and_ps(vtmp1, vme);
1207- vsp = _mm_and_ps(vvp1, _mm_cmpgt_ps(vtmp2, vvp1));
1208- vsn = _mm_and_ps(vvn1, _mm_cmplt_ps(vtmp2, vvn1));
1209- vsign2 = _mm_or_ps(vsp, vsn);
1210- vme = _mm_and_ps(_mm_cmple_ps(vtmp2, vvp1), _mm_cmpge_ps(vtmp2, vvn1));
1211- vbase2 = _mm_and_ps(vtmp2, vme);
1212- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1213- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1214- vtmp1 = _mm_andnot_ps(vtmp1, vmsign);
1215- vtmp2 = _mm_andnot_ps(vtmp2, vmsign);
1216- vtmp1 = _mm_mul_ps(vtmp1, vvq);
1217- vtmp2 = _mm_mul_ps(vtmp2, vvq);
1218- vtmp1 = _mm_add_ps(vtmp1, vv11);
1219- vtmp2 = _mm_add_ps(vtmp2, vv11);
1220- vtmp1 = _mm_mul_ps(vtmp1, vsign1);
1221- vtmp2 = _mm_mul_ps(vtmp2, vsign2);
1222- vtmp1 = _mm_add_ps(vtmp1, vbase1);
1223- vtmp2 = _mm_add_ps(vtmp2, vbase2);
1224- vtmp1 = _mm_mul_ps(vtmp1, vlevel);
1225- vtmp2 = _mm_mul_ps(vtmp2, vlevel);
1226996 _mm_storeu_ps(&sp[i], vtmp1);
1227997 _mm_storeu_ps(&sp[i + 4], vtmp2);
998+#endif
1228999 }
12291000 }
12301001 #endif // USE_X86_EXT_INTRIN
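
Case 6 no longer needs the sign/select machinery at all: the shaped term -0.15 * |x| + 1.1 is computed on the absolute value and then added back to the signed base, so only a sign-bit clear is required. The standard SSE spelling of that fabs (note that _mm_andnot_ps(a, b) computes (~a) & b, so the conventional order puts the sign mask in the first argument; vx is a hypothetical input):

    const __m128 vmsign = _mm_set1_ps(-0.0f);  /* only the IEEE sign bit set */
    __m128 vabs = _mm_andnot_ps(vmsign, vx);   /* (~mask) & x == |x| per lane */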
--- a/timidity/wasapi_a.c
+++ b/timidity/wasapi_a.c
@@ -578,6 +578,10 @@ static int get_device(IMMDevice **ppMMDevice, int devnum)
578578 goto error;
579579 if(pszDeviceId)
580580 CoTaskMemFree(pszDeviceId);
581+ if(pdev)
582+ IMMDevice_Release(pdev);
583+ if(pdc)
584+ IMMDeviceCollection_Release(pdc);
581585 if(pde)
582586 IMMDeviceEnumerator_Release(pde);
583587 return TRUE;
@@ -585,6 +589,10 @@ static int get_device(IMMDevice **ppMMDevice, int devnum)
585589 error:
586590 if(pszDeviceId)
587591 CoTaskMemFree(pszDeviceId);
592+ if(pdev)
593+ IMMDevice_Release(pdev);
594+ if(pdc)
595+ IMMDeviceCollection_Release(pdc);
588596 if(pde)
589597 IMMDeviceEnumerator_Release(pde);
590598 return FALSE;
@@ -637,7 +645,7 @@ static void print_device_list(void)
637645 device[0].LatencyMax = LatencyMax;
638646 device[0].LatencyMin = LatencyMin;
639647 if(tmpClient){
640- tmpClient->lpVtbl->Release(tmpClient);
648+ IAudioClient_Release(tmpClient);
641649 tmpClient = NULL;
642650 }
643651 if(defdev){
@@ -688,7 +696,7 @@ static void print_device_list(void)
688696 device[i+1].LatencyMax = LatencyMax;
689697 device[i+1].LatencyMin = LatencyMin;
690698 if(tmpClient){
691- tmpClient->lpVtbl->Release(tmpClient);
699+ IAudioClient_Release(tmpClient);
692700 tmpClient = NULL;
693701 }
694702 if(dev){
@@ -696,12 +704,12 @@ static void print_device_list(void)
696704 dev = NULL;
697705 }
698706 if(pps){
699- pps->lpVtbl->Release(pps);
707+ IPropertyStore_Release(pps);
700708 pps = NULL;
701709 }
702710 }
703711 if(pdc)
704- pdc->lpVtbl->Release(pdc);
712+ IMMDeviceCollection_Release(pdc);
705713 if(pde)
706714 IMMDeviceEnumerator_Release(pde);
707715 for(i = 0; i < num; i++){
@@ -713,9 +721,9 @@ static void print_device_list(void)
713721 return;
714722 error1:
715723 if(tmpClient)
716- tmpClient->lpVtbl->Release(tmpClient);
724+ IAudioClient_Release(tmpClient);
717725 if(pdc){
718- pdc->lpVtbl->Release(pdc);
726+ IMMDeviceCollection_Release(pdc);
719727 }
720728 if(pde)
721729 IMMDeviceEnumerator_Release(pde);
@@ -1226,7 +1234,7 @@ int wasapi_device_list(WASAPI_DEVICELIST *device)
12261234 device[0].LatencyMax = LatencyMax;
12271235 device[0].LatencyMin = LatencyMin;
12281236 if(tmpClient){
1229- tmpClient->lpVtbl->Release(tmpClient);
1237+ IAudioClient_Release(tmpClient);
12301238 tmpClient = NULL;
12311239 }
12321240 if(defdev){
@@ -1277,7 +1285,7 @@ int wasapi_device_list(WASAPI_DEVICELIST *device)
12771285 device[i+1].LatencyMax = LatencyMax;
12781286 device[i+1].LatencyMin = LatencyMin;
12791287 if(tmpClient){
1280- tmpClient->lpVtbl->Release(tmpClient);
1288+ IAudioClient_Release(tmpClient);
12811289 tmpClient = NULL;
12821290 }
12831291 if(dev){
@@ -1285,21 +1293,21 @@ int wasapi_device_list(WASAPI_DEVICELIST *device)
12851293 dev = NULL;
12861294 }
12871295 if(pps){
1288- pps->lpVtbl->Release(pps);
1296+ IPropertyStore_Release(pps);
12891297 pps = NULL;
12901298 }
12911299 }
12921300 if(pdc)
1293- pdc->lpVtbl->Release(pdc);
1301+ IMMDeviceCollection_Release(pdc);
12941302 if(pde)
12951303 IMMDeviceEnumerator_Release(pde);
12961304 return num + 1; // +1 def dev
12971305
12981306 error1:
12991307 if(tmpClient)
1300- tmpClient->lpVtbl->Release(tmpClient);
1308+ IAudioClient_Release(tmpClient);
13011309 if(pdc){
1302- pdc->lpVtbl->Release(pdc);
1310+ IMMDeviceCollection_Release(pdc);
13031311 }
13041312 if(pde)
13051313 IMMDeviceEnumerator_Release(pde);
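
The wasapi_a.c changes are twofold: the raw tmpClient->lpVtbl->Release(...) calls are replaced by the COBJMACROS helper macros (IAudioClient_Release, IMMDeviceCollection_Release, IPropertyStore_Release, ...), which expand to the same vtable call, and get_device() now releases pdev and pdc on both the success and error paths, fixing COM reference leaks. The resulting cleanup pattern, mirroring the diff (variable names follow get_device()):

    /* Release everything that was acquired, on success and failure alike. */
    if (pszDeviceId)
        CoTaskMemFree(pszDeviceId);          /* string returned by GetId()    */
    if (pdev)
        IMMDevice_Release(pdev);             /* device taken from the collection */
    if (pdc)
        IMMDeviceCollection_Release(pdc);    /* EnumAudioEndpoints result     */
    if (pde)
        IMMDeviceEnumerator_Release(pde);    /* CoCreateInstance result       */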