• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

java, c++, android, linux, c#, windows, objective-c, cocoa, 誰得, qt, python, php, ruby, game, gui, bathyscaphe, c, 計画中(planning stage), 翻訳, omegat, framework, twitter, dom, test, vb.net, directx, ゲームエンジン, btron, arduino, previewer

Commit MetaInfo

Revision: 14e4c1e2355473ccb2939afc69ac8f25de103b92 (tree)
Time: 2018-02-09 00:54:08
Author: Richard Henderson <richard.henderson@lina...>
Committer: Richard Henderson

Log Message

tcg/aarch64: Add vector operations

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Change Summary

Incremental Difference

--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -31,13 +31,22 @@ typedef enum {
3131 TCG_REG_SP = 31,
3232 TCG_REG_XZR = 31,
3333
34+ TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
35+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
36+ TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
37+ TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
38+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
39+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
40+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
41+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
42+
3443 /* Aliases. */
3544 TCG_REG_FP = TCG_REG_X29,
3645 TCG_REG_LR = TCG_REG_X30,
3746 TCG_AREG0 = TCG_REG_X19,
3847 } TCGReg;
3948
40-#define TCG_TARGET_NB_REGS 32
49+#define TCG_TARGET_NB_REGS 64
4150
4251 /* used for function call generation */
4352 #define TCG_REG_CALL_STACK TCG_REG_SP
@@ -113,6 +122,20 @@ typedef enum {
113122 #define TCG_TARGET_HAS_mulsh_i64 1
114123 #define TCG_TARGET_HAS_direct_jump 1
115124
125+#define TCG_TARGET_HAS_v64 1
126+#define TCG_TARGET_HAS_v128 1
127+#define TCG_TARGET_HAS_v256 0
128+
129+#define TCG_TARGET_HAS_andc_vec 1
130+#define TCG_TARGET_HAS_orc_vec 1
131+#define TCG_TARGET_HAS_not_vec 1
132+#define TCG_TARGET_HAS_neg_vec 1
133+#define TCG_TARGET_HAS_shi_vec 1
134+#define TCG_TARGET_HAS_shs_vec 0
135+#define TCG_TARGET_HAS_shv_vec 0
136+#define TCG_TARGET_HAS_cmp_vec 1
137+#define TCG_TARGET_HAS_mul_vec 1
138+
116139 #define TCG_TARGET_DEFAULT_MO (0)
117140
118141 static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
2020
2121 #ifdef CONFIG_DEBUG_TCG
2222 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
23- "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7",
24- "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15",
25- "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23",
26- "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp",
23+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
24+ "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
25+ "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
26+ "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
27+
28+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
29+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
30+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
31+ "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
2732 };
2833 #endif /* CONFIG_DEBUG_TCG */
2934
@@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = {
4348 /* X19 reserved for AREG0 */
4449 /* X29 reserved as fp */
4550 /* X30 reserved as temporary */
51+
52+ TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
53+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
54+ /* V8 - V15 are call-saved, and skipped. */
55+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
56+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
57+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
58+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
4659 };
4760
4861 static const int tcg_target_call_iarg_regs[8] = {
@@ -54,6 +67,7 @@ static const int tcg_target_call_oarg_regs[1] = {
5467 };
5568
5669 #define TCG_REG_TMP TCG_REG_X30
70+#define TCG_VEC_TMP TCG_REG_V31
5771
5872 #ifndef CONFIG_SOFTMMU
5973 /* Note that XZR cannot be encoded in the address base register slot,
@@ -119,9 +133,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
119133 const char *ct_str, TCGType type)
120134 {
121135 switch (*ct_str++) {
122- case 'r':
136+ case 'r': /* general registers */
123137 ct->ct |= TCG_CT_REG;
124- ct->u.regs = 0xffffffffu;
138+ ct->u.regs |= 0xffffffffu;
139+ break;
140+ case 'w': /* advsimd registers */
141+ ct->ct |= TCG_CT_REG;
142+ ct->u.regs |= 0xffffffff00000000ull;
125143 break;
126144 case 'l': /* qemu_ld / qemu_st address, data_reg */
127145 ct->ct |= TCG_CT_REG;
@@ -153,11 +171,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
153171 return ct_str;
154172 }
155173
174+/* Match a constant valid for addition (12-bit, optionally shifted). */
156175 static inline bool is_aimm(uint64_t val)
157176 {
158177 return (val & ~0xfff) == 0 || (val & ~0xfff000) == 0;
159178 }
160179
180+/* Match a constant valid for logical operations. */
161181 static inline bool is_limm(uint64_t val)
162182 {
163183 /* Taking a simplified view of the logical immediates for now, ignoring
@@ -178,6 +198,106 @@ static inline bool is_limm(uint64_t val)
178198 return (val & (val - 1)) == 0;
179199 }
180200
201+/* Match a constant that is valid for vectors. */
202+static bool is_fimm(uint64_t v64, int *op, int *cmode, int *imm8)
203+{
204+ int i;
205+
206+ *op = 0;
207+ /* Match replication across 8 bits. */
208+ if (v64 == dup_const(MO_8, v64)) {
209+ *cmode = 0xe;
210+ *imm8 = v64 & 0xff;
211+ return true;
212+ }
213+ /* Match replication across 16 bits. */
214+ if (v64 == dup_const(MO_16, v64)) {
215+ uint16_t v16 = v64;
216+
217+ if (v16 == (v16 & 0xff)) {
218+ *cmode = 0x8;
219+ *imm8 = v16 & 0xff;
220+ return true;
221+ } else if (v16 == (v16 & 0xff00)) {
222+ *cmode = 0xa;
223+ *imm8 = v16 >> 8;
224+ return true;
225+ }
226+ }
227+ /* Match replication across 32 bits. */
228+ if (v64 == dup_const(MO_32, v64)) {
229+ uint32_t v32 = v64;
230+
231+ if (v32 == (v32 & 0xff)) {
232+ *cmode = 0x0;
233+ *imm8 = v32 & 0xff;
234+ return true;
235+ } else if (v32 == (v32 & 0xff00)) {
236+ *cmode = 0x2;
237+ *imm8 = (v32 >> 8) & 0xff;
238+ return true;
239+ } else if (v32 == (v32 & 0xff0000)) {
240+ *cmode = 0x4;
241+ *imm8 = (v32 >> 16) & 0xff;
242+ return true;
243+ } else if (v32 == (v32 & 0xff000000)) {
244+ *cmode = 0x6;
245+ *imm8 = v32 >> 24;
246+ return true;
247+ } else if ((v32 & 0xffff00ff) == 0xff) {
248+ *cmode = 0xc;
249+ *imm8 = (v32 >> 8) & 0xff;
250+ return true;
251+ } else if ((v32 & 0xff00ffff) == 0xffff) {
252+ *cmode = 0xd;
253+ *imm8 = (v32 >> 16) & 0xff;
254+ return true;
255+ }
256+ /* Match forms of a float32. */
257+ if (extract32(v32, 0, 19) == 0
258+ && (extract32(v32, 25, 6) == 0x20
259+ || extract32(v32, 25, 6) == 0x1f)) {
260+ *cmode = 0xf;
261+ *imm8 = (extract32(v32, 31, 1) << 7)
262+ | (extract32(v32, 25, 1) << 6)
263+ | extract32(v32, 19, 6);
264+ return true;
265+ }
266+ }
267+ /* Match forms of a float64. */
268+ if (extract64(v64, 0, 48) == 0
269+ && (extract64(v64, 54, 9) == 0x100
270+ || extract64(v64, 54, 9) == 0x0ff)) {
271+ *cmode = 0xf;
272+ *op = 1;
273+ *imm8 = (extract64(v64, 63, 1) << 7)
274+ | (extract64(v64, 54, 1) << 6)
275+ | extract64(v64, 48, 6);
276+ return true;
277+ }
278+ /* Match bytes of 0x00 and 0xff. */
279+ for (i = 0; i < 64; i += 8) {
280+ uint64_t byte = extract64(v64, i, 8);
281+ if (byte != 0 && byte != 0xff) {
282+ break;
283+ }
284+ }
285+ if (i == 64) {
286+ *cmode = 0xe;
287+ *op = 1;
288+ *imm8 = (extract64(v64, 0, 1) << 0)
289+ | (extract64(v64, 8, 1) << 1)
290+ | (extract64(v64, 16, 1) << 2)
291+ | (extract64(v64, 24, 1) << 3)
292+ | (extract64(v64, 32, 1) << 4)
293+ | (extract64(v64, 40, 1) << 5)
294+ | (extract64(v64, 48, 1) << 6)
295+ | (extract64(v64, 56, 1) << 7);
296+ return true;
297+ }
298+ return false;
299+}
300+
181301 static int tcg_target_const_match(tcg_target_long val, TCGType type,
182302 const TCGArgConstraint *arg_ct)
183303 {
@@ -271,6 +391,9 @@ typedef enum {
271391
272392 /* Load literal for loading the address at pc-relative offset */
273393 I3305_LDR = 0x58000000,
394+ I3305_LDR_v64 = 0x5c000000,
395+ I3305_LDR_v128 = 0x9c000000,
396+
274397 /* Load/store register. Described here as 3.3.12, but the helper
275398 that emits them can transform to 3.3.10 or 3.3.13. */
276399 I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@@ -290,6 +413,15 @@ typedef enum {
290413 I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
291414 I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
292415
416+ I3312_LDRVS = 0x3c000000 | LDST_LD << 22 | MO_32 << 30,
417+ I3312_STRVS = 0x3c000000 | LDST_ST << 22 | MO_32 << 30,
418+
419+ I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
420+ I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
421+
422+ I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30,
423+ I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30,
424+
293425 I3312_TO_I3310 = 0x00200800,
294426 I3312_TO_I3313 = 0x01000000,
295427
@@ -374,8 +506,48 @@ typedef enum {
374506 I3510_EON = 0x4a200000,
375507 I3510_ANDS = 0x6a000000,
376508
377- NOP = 0xd503201f,
509+ /* AdvSIMD copy */
510+ I3605_DUP = 0x0e000400,
511+ I3605_INS = 0x4e001c00,
512+ I3605_UMOV = 0x0e003c00,
513+
514+ /* AdvSIMD modified immediate */
515+ I3606_MOVI = 0x0f000400,
516+
517+ /* AdvSIMD shift by immediate */
518+ I3614_SSHR = 0x0f000400,
519+ I3614_SSRA = 0x0f001400,
520+ I3614_SHL = 0x0f005400,
521+ I3614_USHR = 0x2f000400,
522+ I3614_USRA = 0x2f001400,
523+
524+ /* AdvSIMD three same. */
525+ I3616_ADD = 0x0e208400,
526+ I3616_AND = 0x0e201c00,
527+ I3616_BIC = 0x0e601c00,
528+ I3616_EOR = 0x2e201c00,
529+ I3616_MUL = 0x0e209c00,
530+ I3616_ORR = 0x0ea01c00,
531+ I3616_ORN = 0x0ee01c00,
532+ I3616_SUB = 0x2e208400,
533+ I3616_CMGT = 0x0e203400,
534+ I3616_CMGE = 0x0e203c00,
535+ I3616_CMTST = 0x0e208c00,
536+ I3616_CMHI = 0x2e203400,
537+ I3616_CMHS = 0x2e203c00,
538+ I3616_CMEQ = 0x2e208c00,
539+
540+ /* AdvSIMD two-reg misc. */
541+ I3617_CMGT0 = 0x0e208800,
542+ I3617_CMEQ0 = 0x0e209800,
543+ I3617_CMLT0 = 0x0e20a800,
544+ I3617_CMGE0 = 0x2e208800,
545+ I3617_CMLE0 = 0x2e20a800,
546+ I3617_NOT = 0x2e205800,
547+ I3617_NEG = 0x2e20b800,
548+
378549 /* System instructions. */
550+ NOP = 0xd503201f,
379551 DMB_ISH = 0xd50338bf,
380552 DMB_LD = 0x00000100,
381553 DMB_ST = 0x00000200,
@@ -520,26 +692,64 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
520692 tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
521693 }
522694
695+static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q,
696+ TCGReg rd, TCGReg rn, int dst_idx, int src_idx)
697+{
698+ /* Note that bit 11 set means general register input. Therefore
699+ we can handle both register sets with one function. */
700+ tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11)
701+ | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5);
702+}
703+
704+static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
705+ TCGReg rd, bool op, int cmode, uint8_t imm8)
706+{
707+ tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
708+ | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
709+}
710+
711+static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
712+ TCGReg rd, TCGReg rn, unsigned immhb)
713+{
714+ tcg_out32(s, insn | q << 30 | immhb << 16
715+ | (rn & 0x1f) << 5 | (rd & 0x1f));
716+}
717+
718+static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
719+ unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
720+{
721+ tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16
722+ | (rn & 0x1f) << 5 | (rd & 0x1f));
723+}
724+
725+static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
726+ unsigned size, TCGReg rd, TCGReg rn)
727+{
728+ tcg_out32(s, insn | q << 30 | (size << 22)
729+ | (rn & 0x1f) << 5 | (rd & 0x1f));
730+}
731+
523732 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
524733 TCGReg rd, TCGReg base, TCGType ext,
525734 TCGReg regoff)
526735 {
527736 /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
528737 tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
529- 0x4000 | ext << 13 | base << 5 | rd);
738+ 0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
530739 }
531740
532741 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
533742 TCGReg rd, TCGReg rn, intptr_t offset)
534743 {
535- tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd);
744+ tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
536745 }
537746
538747 static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
539748 TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
540749 {
541750 /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
542- tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd);
751+ tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
752+ | rn << 5 | (rd & 0x1f));
543753 }
544754
545755 /* Register to register move using ORR (shifted register with no shift). */
@@ -585,6 +795,22 @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
585795 tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
586796 }
587797
798+static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
799+ TCGReg rd, uint64_t v64)
800+{
801+ int op, cmode, imm8;
802+
803+ if (is_fimm(v64, &op, &cmode, &imm8)) {
804+ tcg_out_insn(s, 3606, MOVI, type == TCG_TYPE_V128, rd, op, cmode, imm8);
805+ } else if (type == TCG_TYPE_V128) {
806+ new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64);
807+ tcg_out_insn(s, 3305, LDR_v128, 0, rd);
808+ } else {
809+ new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0);
810+ tcg_out_insn(s, 3305, LDR_v64, 0, rd);
811+ }
812+}
813+
588814 static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
589815 tcg_target_long value)
590816 {
@@ -594,6 +820,22 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
594820 int s0, s1;
595821 AArch64Insn opc;
596822
823+ switch (type) {
824+ case TCG_TYPE_I32:
825+ case TCG_TYPE_I64:
826+ tcg_debug_assert(rd < 32);
827+ break;
828+
829+ case TCG_TYPE_V64:
830+ case TCG_TYPE_V128:
831+ tcg_debug_assert(rd >= 32);
832+ tcg_out_dupi_vec(s, type, rd, value);
833+ return;
834+
835+ default:
836+ g_assert_not_reached();
837+ }
838+
597839 /* For 32-bit values, discard potential garbage in value. For 64-bit
598840 values within [2**31, 2**32-1], we can create smaller sequences by
599841 interpreting this as a negative 32-bit number, while ensuring that
@@ -669,15 +911,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
669911 /* Define something more legible for general use. */
670912 #define tcg_out_ldst_r tcg_out_insn_3310
671913
672-static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
673- TCGReg rd, TCGReg rn, intptr_t offset)
914+static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
915+ TCGReg rn, intptr_t offset, int lgsize)
674916 {
675- TCGMemOp size = (uint32_t)insn >> 30;
676-
677917 /* If the offset is naturally aligned and in range, then we can
678918 use the scaled uimm12 encoding */
679- if (offset >= 0 && !(offset & ((1 << size) - 1))) {
680- uintptr_t scaled_uimm = offset >> size;
919+ if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
920+ uintptr_t scaled_uimm = offset >> lgsize;
681921 if (scaled_uimm <= 0xfff) {
682922 tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
683923 return;
@@ -695,32 +935,102 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
695935 tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
696936 }
697937
698-static inline void tcg_out_mov(TCGContext *s,
699- TCGType type, TCGReg ret, TCGReg arg)
938+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
700939 {
701- if (ret != arg) {
702- tcg_out_movr(s, type, ret, arg);
940+ if (ret == arg) {
941+ return;
942+ }
943+ switch (type) {
944+ case TCG_TYPE_I32:
945+ case TCG_TYPE_I64:
946+ if (ret < 32 && arg < 32) {
947+ tcg_out_movr(s, type, ret, arg);
948+ break;
949+ } else if (ret < 32) {
950+ tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0);
951+ break;
952+ } else if (arg < 32) {
953+ tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0);
954+ break;
955+ }
956+ /* FALLTHRU */
957+
958+ case TCG_TYPE_V64:
959+ tcg_debug_assert(ret >= 32 && arg >= 32);
960+ tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg);
961+ break;
962+ case TCG_TYPE_V128:
963+ tcg_debug_assert(ret >= 32 && arg >= 32);
964+ tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg);
965+ break;
966+
967+ default:
968+ g_assert_not_reached();
703969 }
704970 }
705971
706-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
707- TCGReg arg1, intptr_t arg2)
972+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
973+ TCGReg base, intptr_t ofs)
708974 {
709- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX,
710- arg, arg1, arg2);
975+ AArch64Insn insn;
976+ int lgsz;
977+
978+ switch (type) {
979+ case TCG_TYPE_I32:
980+ insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS);
981+ lgsz = 2;
982+ break;
983+ case TCG_TYPE_I64:
984+ insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD);
985+ lgsz = 3;
986+ break;
987+ case TCG_TYPE_V64:
988+ insn = I3312_LDRVD;
989+ lgsz = 3;
990+ break;
991+ case TCG_TYPE_V128:
992+ insn = I3312_LDRVQ;
993+ lgsz = 4;
994+ break;
995+ default:
996+ g_assert_not_reached();
997+ }
998+ tcg_out_ldst(s, insn, ret, base, ofs, lgsz);
711999 }
7121000
713-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
714- TCGReg arg1, intptr_t arg2)
1001+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src,
1002+ TCGReg base, intptr_t ofs)
7151003 {
716- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX,
717- arg, arg1, arg2);
1004+ AArch64Insn insn;
1005+ int lgsz;
1006+
1007+ switch (type) {
1008+ case TCG_TYPE_I32:
1009+ insn = (src < 32 ? I3312_STRW : I3312_STRVS);
1010+ lgsz = 2;
1011+ break;
1012+ case TCG_TYPE_I64:
1013+ insn = (src < 32 ? I3312_STRX : I3312_STRVD);
1014+ lgsz = 3;
1015+ break;
1016+ case TCG_TYPE_V64:
1017+ insn = I3312_STRVD;
1018+ lgsz = 3;
1019+ break;
1020+ case TCG_TYPE_V128:
1021+ insn = I3312_STRVQ;
1022+ lgsz = 4;
1023+ break;
1024+ default:
1025+ g_assert_not_reached();
1026+ }
1027+ tcg_out_ldst(s, insn, src, base, ofs, lgsz);
7181028 }
7191029
7201030 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
7211031 TCGReg base, intptr_t ofs)
7221032 {
723- if (val == 0) {
1033+ if (type <= TCG_TYPE_I64 && val == 0) {
7241034 tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
7251035 return true;
7261036 }
@@ -1210,14 +1520,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
12101520 /* Merge "low bits" from tlb offset, load the tlb comparator into X0.
12111521 X0 = load [X2 + (tlb_offset & 0x000fff)] */
12121522 tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
1213- TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
1523+ TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
1524+ TARGET_LONG_BITS == 32 ? 2 : 3);
12141525
12151526 /* Load the tlb addend. Do that early to avoid stalling.
12161527 X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
12171528 tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
12181529 (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
12191530 (is_read ? offsetof(CPUTLBEntry, addr_read)
1220- : offsetof(CPUTLBEntry, addr_write)));
1531+ : offsetof(CPUTLBEntry, addr_write)), 3);
12211532
12221533 /* Perform the address comparison. */
12231534 tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
@@ -1435,49 +1746,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
14351746
14361747 case INDEX_op_ld8u_i32:
14371748 case INDEX_op_ld8u_i64:
1438- tcg_out_ldst(s, I3312_LDRB, a0, a1, a2);
1749+ tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
14391750 break;
14401751 case INDEX_op_ld8s_i32:
1441- tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2);
1752+ tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
14421753 break;
14431754 case INDEX_op_ld8s_i64:
1444- tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2);
1755+ tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
14451756 break;
14461757 case INDEX_op_ld16u_i32:
14471758 case INDEX_op_ld16u_i64:
1448- tcg_out_ldst(s, I3312_LDRH, a0, a1, a2);
1759+ tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
14491760 break;
14501761 case INDEX_op_ld16s_i32:
1451- tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2);
1762+ tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
14521763 break;
14531764 case INDEX_op_ld16s_i64:
1454- tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2);
1765+ tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
14551766 break;
14561767 case INDEX_op_ld_i32:
14571768 case INDEX_op_ld32u_i64:
1458- tcg_out_ldst(s, I3312_LDRW, a0, a1, a2);
1769+ tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
14591770 break;
14601771 case INDEX_op_ld32s_i64:
1461- tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2);
1772+ tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
14621773 break;
14631774 case INDEX_op_ld_i64:
1464- tcg_out_ldst(s, I3312_LDRX, a0, a1, a2);
1775+ tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
14651776 break;
14661777
14671778 case INDEX_op_st8_i32:
14681779 case INDEX_op_st8_i64:
1469- tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2);
1780+ tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
14701781 break;
14711782 case INDEX_op_st16_i32:
14721783 case INDEX_op_st16_i64:
1473- tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2);
1784+ tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
14741785 break;
14751786 case INDEX_op_st_i32:
14761787 case INDEX_op_st32_i64:
1477- tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2);
1788+ tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
14781789 break;
14791790 case INDEX_op_st_i64:
1480- tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2);
1791+ tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
14811792 break;
14821793
14831794 case INDEX_op_add_i32:
@@ -1776,25 +2087,176 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
17762087
17772088 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
17782089 case INDEX_op_mov_i64:
2090+ case INDEX_op_mov_vec:
17792091 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
17802092 case INDEX_op_movi_i64:
2093+ case INDEX_op_dupi_vec:
17812094 case INDEX_op_call: /* Always emitted via tcg_out_call. */
17822095 default:
1783- tcg_abort();
2096+ g_assert_not_reached();
17842097 }
17852098
17862099 #undef REG0
17872100 }
17882101
2102+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2103+ unsigned vecl, unsigned vece,
2104+ const TCGArg *args, const int *const_args)
2105+{
2106+ static const AArch64Insn cmp_insn[16] = {
2107+ [TCG_COND_EQ] = I3616_CMEQ,
2108+ [TCG_COND_GT] = I3616_CMGT,
2109+ [TCG_COND_GE] = I3616_CMGE,
2110+ [TCG_COND_GTU] = I3616_CMHI,
2111+ [TCG_COND_GEU] = I3616_CMHS,
2112+ };
2113+ static const AArch64Insn cmp0_insn[16] = {
2114+ [TCG_COND_EQ] = I3617_CMEQ0,
2115+ [TCG_COND_GT] = I3617_CMGT0,
2116+ [TCG_COND_GE] = I3617_CMGE0,
2117+ [TCG_COND_LT] = I3617_CMLT0,
2118+ [TCG_COND_LE] = I3617_CMLE0,
2119+ };
2120+
2121+ TCGType type = vecl + TCG_TYPE_V64;
2122+ unsigned is_q = vecl;
2123+ TCGArg a0, a1, a2;
2124+
2125+ a0 = args[0];
2126+ a1 = args[1];
2127+ a2 = args[2];
2128+
2129+ switch (opc) {
2130+ case INDEX_op_ld_vec:
2131+ tcg_out_ld(s, type, a0, a1, a2);
2132+ break;
2133+ case INDEX_op_st_vec:
2134+ tcg_out_st(s, type, a0, a1, a2);
2135+ break;
2136+ case INDEX_op_add_vec:
2137+ tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
2138+ break;
2139+ case INDEX_op_sub_vec:
2140+ tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
2141+ break;
2142+ case INDEX_op_mul_vec:
2143+ tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
2144+ break;
2145+ case INDEX_op_neg_vec:
2146+ tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
2147+ break;
2148+ case INDEX_op_and_vec:
2149+ tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
2150+ break;
2151+ case INDEX_op_or_vec:
2152+ tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2);
2153+ break;
2154+ case INDEX_op_xor_vec:
2155+ tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
2156+ break;
2157+ case INDEX_op_andc_vec:
2158+ tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2);
2159+ break;
2160+ case INDEX_op_orc_vec:
2161+ tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
2162+ break;
2163+ case INDEX_op_not_vec:
2164+ tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
2165+ break;
2166+ case INDEX_op_dup_vec:
2167+ tcg_out_insn(s, 3605, DUP, is_q, a0, a1, 1 << vece, 0);
2168+ break;
2169+ case INDEX_op_shli_vec:
2170+ tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
2171+ break;
2172+ case INDEX_op_shri_vec:
2173+ tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
2174+ break;
2175+ case INDEX_op_sari_vec:
2176+ tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
2177+ break;
2178+ case INDEX_op_cmp_vec:
2179+ {
2180+ TCGCond cond = args[3];
2181+ AArch64Insn insn;
2182+
2183+ if (cond == TCG_COND_NE) {
2184+ if (const_args[2]) {
2185+ tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
2186+ } else {
2187+ tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
2188+ tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
2189+ }
2190+ } else {
2191+ if (const_args[2]) {
2192+ insn = cmp0_insn[cond];
2193+ if (insn) {
2194+ tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
2195+ break;
2196+ }
2197+ tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0);
2198+ a2 = TCG_VEC_TMP;
2199+ }
2200+ insn = cmp_insn[cond];
2201+ if (insn == 0) {
2202+ TCGArg t;
2203+ t = a1, a1 = a2, a2 = t;
2204+ cond = tcg_swap_cond(cond);
2205+ insn = cmp_insn[cond];
2206+ tcg_debug_assert(insn != 0);
2207+ }
2208+ tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
2209+ }
2210+ }
2211+ break;
2212+ default:
2213+ g_assert_not_reached();
2214+ }
2215+}
2216+
2217+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
2218+{
2219+ switch (opc) {
2220+ case INDEX_op_add_vec:
2221+ case INDEX_op_sub_vec:
2222+ case INDEX_op_mul_vec:
2223+ case INDEX_op_and_vec:
2224+ case INDEX_op_or_vec:
2225+ case INDEX_op_xor_vec:
2226+ case INDEX_op_andc_vec:
2227+ case INDEX_op_orc_vec:
2228+ case INDEX_op_neg_vec:
2229+ case INDEX_op_not_vec:
2230+ case INDEX_op_cmp_vec:
2231+ case INDEX_op_shli_vec:
2232+ case INDEX_op_shri_vec:
2233+ case INDEX_op_sari_vec:
2234+ return 1;
2235+
2236+ default:
2237+ return 0;
2238+ }
2239+}
2240+
2241+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
2242+ TCGArg a0, ...)
2243+{
2244+}
2245+
17892246 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
17902247 {
17912248 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
17922249 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2250+ static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
2251+ static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
2252+ static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } };
17932253 static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
17942254 static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
17952255 static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
17962256 static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
17972257 static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
2258+ static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
2259+ static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } };
17982260 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
17992261 static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
18002262 static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
@@ -1938,6 +2400,29 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
19382400 case INDEX_op_sub2_i64:
19392401 return &add2;
19402402
2403+ case INDEX_op_add_vec:
2404+ case INDEX_op_sub_vec:
2405+ case INDEX_op_mul_vec:
2406+ case INDEX_op_and_vec:
2407+ case INDEX_op_or_vec:
2408+ case INDEX_op_xor_vec:
2409+ case INDEX_op_andc_vec:
2410+ case INDEX_op_orc_vec:
2411+ return &w_w_w;
2412+ case INDEX_op_not_vec:
2413+ case INDEX_op_neg_vec:
2414+ case INDEX_op_shli_vec:
2415+ case INDEX_op_shri_vec:
2416+ case INDEX_op_sari_vec:
2417+ return &w_w;
2418+ case INDEX_op_ld_vec:
2419+ case INDEX_op_st_vec:
2420+ return &w_r;
2421+ case INDEX_op_dup_vec:
2422+ return &w_wr;
2423+ case INDEX_op_cmp_vec:
2424+ return &w_w_wZ;
2425+
19412426 default:
19422427 return NULL;
19432428 }
@@ -1947,8 +2432,10 @@ static void tcg_target_init(TCGContext *s)
19472432 {
19482433 tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
19492434 tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
2435+ tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
2436+ tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
19502437
1951- tcg_target_call_clobber_regs = 0xfffffffu;
2438+ tcg_target_call_clobber_regs = -1ull;
19522439 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
19532440 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
19542441 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
@@ -1960,12 +2447,21 @@ static void tcg_target_init(TCGContext *s)
19602447 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
19612448 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
19622449 tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
2450+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
2451+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
2452+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
2453+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
2454+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
2455+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
2456+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
2457+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
19632458
19642459 s->reserved_regs = 0;
19652460 tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
19662461 tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
19672462 tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
19682463 tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
2464+ tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
19692465 }
19702466
19712467 /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */
--- /dev/null
+++ b/tcg/aarch64/tcg-target.opc.h
@@ -0,0 +1,3 @@
1+/* Target-specific opcodes for host vector expansion. These will be
2+ emitted by tcg_expand_vec_op. For those familiar with GCC internals,
3+ consider these to be UNSPEC with names. */