• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

Commit MetaInfo

Revision: e7e96fc5ec8c79dc77fef522d5226ac09f684ba5 (tree)
Time: 2020-02-22 01:07:02
Author: Richard Henderson <richard.henderson@lina...>
Committer: Peter Maydell

Log Message

target/arm: Convert PMULL.8 to gvec

We still need two different helpers, since NEON and SVE2 get the
inputs from different locations within the source vector. However,
we can convert both to the same internal form for computation.

The sve2 helper is not used yet, but adding it with this patch
helps illustrate why the neon changes are helpful.

Tested-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20200216214232.4230-5-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

Change Summary

Incremental Difference

--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
15741574 void, env, ptr, ptr, ptr, tl, i32)
15751575 DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
15761576 void, env, ptr, ptr, ptr, tl, i32)
1577+
1578+DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -342,7 +342,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
342342 DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
343343 DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
344344 DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
345-DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
346345
347346 DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
348347 DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
@@ -695,6 +694,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
695694 DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
696695 DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
697696
697+DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
698+
698699 #ifdef TARGET_AARCH64
699700 #include "helper-a64.h"
700701 #include "helper-sve.h"
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
11291129 NEON_VOP(mul_u16, neon_u16, 2)
11301130 #undef NEON_FN
11311131
1132-/* Polynomial multiplication is like integer multiplication except the
1133- partial products are XORed, not added. */
1134-uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1135-{
1136- uint64_t result = 0;
1137- uint64_t mask;
1138- uint64_t op2ex = op2;
1139- op2ex = (op2ex & 0xff) |
1140- ((op2ex & 0xff00) << 8) |
1141- ((op2ex & 0xff0000) << 16) |
1142- ((op2ex & 0xff000000) << 24);
1143- while (op1) {
1144- mask = 0;
1145- if (op1 & 1) {
1146- mask |= 0xffff;
1147- }
1148- if (op1 & (1 << 8)) {
1149- mask |= (0xffffU << 16);
1150- }
1151- if (op1 & (1 << 16)) {
1152- mask |= (0xffffULL << 32);
1153- }
1154- if (op1 & (1 << 24)) {
1155- mask |= (0xffffULL << 48);
1156- }
1157- result ^= op2ex & mask;
1158- op1 = (op1 >> 1) & 0x7f7f7f7f;
1159- op2ex <<= 1;
1160- }
1161- return result;
1162-}
1163-
11641132 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
11651133 NEON_VOP(tst_u8, neon_u8, 4)
11661134 NEON_VOP(tst_u16, neon_u16, 2)
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10542,10 +10542,6 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
1054210542 gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
1054310543 tcg_passres, tcg_passres);
1054410544 break;
10545- case 14: /* PMULL */
10546- assert(size == 0);
10547- gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
10548- break;
1054910545 default:
1055010546 g_assert_not_reached();
1055110547 }
@@ -10709,11 +10705,21 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
1070910705 handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
1071010706 break;
1071110707 case 14: /* PMULL, PMULL2 */
10712- if (is_u || size == 1 || size == 2) {
10708+ if (is_u) {
1071310709 unallocated_encoding(s);
1071410710 return;
1071510711 }
10716- if (size == 3) {
10712+ switch (size) {
10713+ case 0: /* PMULL.P8 */
10714+ if (!fp_access_check(s)) {
10715+ return;
10716+ }
10717+ /* The Q field specifies lo/hi half input for this insn. */
10718+ gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
10719+ gen_helper_neon_pmull_h);
10720+ break;
10721+
10722+ case 3: /* PMULL.P64 */
1071710723 if (!dc_isar_feature(aa64_pmull, s)) {
1071810724 unallocated_encoding(s);
1071910725 return;
@@ -10724,9 +10730,13 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
1072410730 /* The Q field specifies lo/hi half input for this insn. */
1072510731 gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
1072610732 gen_helper_gvec_pmull_q);
10727- return;
10733+ break;
10734+
10735+ default:
10736+ unallocated_encoding(s);
10737+ break;
1072810738 }
10729- goto is_widening;
10739+ return;
1073010740 case 9: /* SQDMLAL, SQDMLAL2 */
1073110741 case 11: /* SQDMLSL, SQDMLSL2 */
1073210742 case 13: /* SQDMULL, SQDMULL2 */
@@ -10747,7 +10757,6 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
1074710757 unallocated_encoding(s);
1074810758 return;
1074910759 }
10750- is_widening:
1075110760 if (!fp_access_check(s)) {
1075210761 return;
1075310762 }
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5866,15 +5866,20 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
58665866 return 1;
58675867 }
58685868
5869- /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
5870- * outside the loop below as it only performs a single pass.
5871- */
5872- if (op == 14 && size == 2) {
5873- if (!dc_isar_feature(aa32_pmull, s)) {
5874- return 1;
5869+ /* Handle polynomial VMULL in a single pass. */
5870+ if (op == 14) {
5871+ if (size == 0) {
5872+ /* VMULL.P8 */
5873+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
5874+ 0, gen_helper_neon_pmull_h);
5875+ } else {
5876+ /* VMULL.P64 */
5877+ if (!dc_isar_feature(aa32_pmull, s)) {
5878+ return 1;
5879+ }
5880+ tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
5881+ 0, gen_helper_gvec_pmull_q);
58755882 }
5876- tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
5877- 0, gen_helper_gvec_pmull_q);
58785883 return 0;
58795884 }
58805885
@@ -5952,11 +5957,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
59525957 /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
59535958 gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
59545959 break;
5955- case 14: /* Polynomial VMULL */
5956- gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
5957- tcg_temp_free_i32(tmp2);
5958- tcg_temp_free_i32(tmp);
5959- break;
59605960 default: /* 15 is RESERVED: caught earlier */
59615961 abort();
59625962 }
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
11971197 }
11981198 clear_tail(d, opr_sz, simd_maxsz(desc));
11991199 }
1200+
1201+/*
1202+ * 8x8->16 polynomial multiply.
1203+ *
1204+ * The byte inputs are expanded to (or extracted from) half-words.
1205+ * Note that neon and sve2 get the inputs from different positions.
1206+ * This allows 4 bytes to be processed in parallel with uint64_t.
1207+ */
1208+
1209+static uint64_t expand_byte_to_half(uint64_t x)
1210+{
1211+ return (x & 0x000000ff)
1212+ | ((x & 0x0000ff00) << 8)
1213+ | ((x & 0x00ff0000) << 16)
1214+ | ((x & 0xff000000) << 24);
1215+}
1216+
1217+static uint64_t pmull_h(uint64_t op1, uint64_t op2)
1218+{
1219+ uint64_t result = 0;
1220+ int i;
1221+
1222+ for (i = 0; i < 8; ++i) {
1223+ uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
1224+ result ^= op2 & mask;
1225+ op1 >>= 1;
1226+ op2 <<= 1;
1227+ }
1228+ return result;
1229+}
1230+
1231+void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
1232+{
1233+ int hi = simd_data(desc);
1234+ uint64_t *d = vd, *n = vn, *m = vm;
1235+ uint64_t nn = n[hi], mm = m[hi];
1236+
1237+ d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
1238+ nn >>= 32;
1239+ mm >>= 32;
1240+ d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
1241+
1242+ clear_tail(d, 16, simd_maxsz(desc));
1243+}
1244+
1245+#ifdef TARGET_AARCH64
1246+void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
1247+{
1248+ int shift = simd_data(desc) * 8;
1249+ intptr_t i, opr_sz = simd_oprsz(desc);
1250+ uint64_t *d = vd, *n = vn, *m = vm;
1251+
1252+ for (i = 0; i < opr_sz / 8; ++i) {
1253+ uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
1254+ uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
1255+
1256+ d[i] = pmull_h(nn, mm);
1257+ }
1258+}
1259+#endif